You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2016/08/26 21:52:48 UTC

[1/6] mesos git commit: Added new TaskState values and PARTITION_AWARE capability.

Repository: mesos
Updated Branches:
  refs/heads/master 8ab7a6f4d -> 8330e99f6


Added new TaskState values and PARTITION_AWARE capability.

TASK_DROPPED, TASK_UNREACHABLE, TASK_GONE, TASK_GONE_BY_OPERATOR, and
TASK_UNKNOWN. These values are intended to replace the existing
TASK_LOST state by offering more fine-grained information on the
current state of a task. These states will only be sent to frameworks
that opt into this new behavior via the PARTITION_AWARE capability.

Note that this commit doesn't add a master metric for the TASK_UNKNOWN
status, because this is a "default" status reported when the master has
no knowledge of a particular task/agent ID. Hence the number of
"unknown" tasks at any given time is not a well-defined metric.

Review: https://reviews.apache.org/r/50699/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/c3268cad
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/c3268cad
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/c3268cad

Branch: refs/heads/master
Commit: c3268cad3621a6373ff331d882393b2ada064f4b
Parents: 8ab7a6f
Author: Neil Conway <ne...@gmail.com>
Authored: Fri Aug 26 14:47:53 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Fri Aug 26 14:47:53 2016 -0700

----------------------------------------------------------------------
 include/mesos/mesos.proto            | 59 ++++++++++++++++++++++++++++++-
 include/mesos/v1/mesos.proto         | 59 ++++++++++++++++++++++++++++++-
 src/common/protobuf_utils.cpp        |  8 ++++-
 src/examples/disk_full_framework.cpp |  5 +++
 src/master/http.cpp                  | 19 +++++++++-
 src/master/master.cpp                | 42 ++++++++++++++++++----
 src/master/metrics.cpp               | 16 +++++++++
 src/master/metrics.hpp               |  4 +++
 src/tests/master_tests.cpp           |  4 ++-
 9 files changed, 205 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/include/mesos/mesos.proto
----------------------------------------------------------------------
diff --git a/include/mesos/mesos.proto b/include/mesos/mesos.proto
index a93db55..7fbcdf0 100644
--- a/include/mesos/mesos.proto
+++ b/include/mesos/mesos.proto
@@ -297,6 +297,24 @@ message FrameworkInfo {
       // Receive offers with resources that are shared.
       // TODO(anindya_sinha): This is currently a no-op.
       SHARED_RESOURCES = 4;
+
+      // Indicates that the framework is prepared to handle the
+      // following TaskStates: TASK_UNREACHABLE, TASK_DROPPED,
+      // TASK_GONE, TASK_GONE_BY_OPERATOR, and TASK_UNKNOWN.
+      //
+      // With this capability, frameworks can define how they would
+      // like to handle partitioned tasks. Frameworks will receive
+      // TASK_UNREACHABLE for tasks on partitioned agents; if/when the
+      // partitioned agent reregisters, the task will not be killed.
+      // Frameworks that enable this capability will never receive
+      // TASK_LOST; they will receive one of the most specific task
+      // statuses listed above instead.
+      //
+      // Without this capability, frameworks will receive TASK_LOST
+      // for tasks on partitioned agents; such tasks will be killed by
+      // Mesos when the agent reregisters (unless the master has
+      // failed over).
+      PARTITION_AWARE = 5;
     }
 
     // Enum fields should be optional, see: MESOS-4997.
@@ -1489,8 +1507,47 @@ enum TaskState {
   TASK_FINISHED = 2; // TERMINAL: The task finished successfully.
   TASK_FAILED = 3;   // TERMINAL: The task failed to finish successfully.
   TASK_KILLED = 4;   // TERMINAL: The task was killed by the executor.
-  TASK_LOST = 5;     // TERMINAL: The task failed but can be rescheduled.
   TASK_ERROR = 7;    // TERMINAL: The task description contains an error.
+
+  // This is only sent when the framework does NOT opt-in to the
+  // PARTITION_AWARE capability.
+  TASK_LOST = 5;     // TERMINAL: The task failed but can be rescheduled.
+
+  // The following task statuses are only sent when the framework
+  // opts-in to the PARTITION_AWARE capability.
+
+  // The task failed to launch because of a transient error. The
+  // task's executor never started running. Unlike TASK_ERROR, the
+  // task description is valid -- attempting to launch the task again
+  // may be successful. This is a terminal state.
+  TASK_DROPPED = 9;
+
+  // The task was running on an agent that has lost contact with the
+  // master, typically due to a network failure or partition. The task
+  // may or may not still be running.
+  TASK_UNREACHABLE = 10;
+
+  // The task was running on an agent that has been shutdown (e.g.,
+  // the agent become partitioned, rebooted, and then reconnected to
+  // the master; any tasks running before the reboot will transition
+  // from UNREACHABLE to GONE). The task is no longer running. This is
+  // a terminal state.
+  TASK_GONE = 11;
+
+  // The task was running on an agent that the master cannot contact;
+  // the operator has asserted that the agent has been shutdown, but
+  // this has not been directly confirmed by the master. If the
+  // operator is correct, the task is not running and this is a
+  // terminal state; if the operator is mistaken, the task might still
+  // be running, and might return to the RUNNING state in the future.
+  TASK_GONE_BY_OPERATOR = 12;
+
+  // The master has no knowledge of the task. This is typically
+  // because either (a) the master never had knowledge of the task, or
+  // (b) the master forgot about the task because it garbaged
+  // collected its metadata about the task. The task may or may not
+  // still be running.
+  TASK_UNKNOWN = 13;
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/include/mesos/v1/mesos.proto
----------------------------------------------------------------------
diff --git a/include/mesos/v1/mesos.proto b/include/mesos/v1/mesos.proto
index 4a7e998..60ec0cc 100644
--- a/include/mesos/v1/mesos.proto
+++ b/include/mesos/v1/mesos.proto
@@ -297,6 +297,24 @@ message FrameworkInfo {
       // Receive offers with resources that are shared.
       // TODO(anindya_sinha): This is currently a no-op.
       SHARED_RESOURCES = 4;
+
+      // Indicates that the framework is prepared to handle the
+      // following TaskStates: TASK_UNREACHABLE, TASK_DROPPED,
+      // TASK_GONE, TASK_GONE_BY_OPERATOR, and TASK_UNKNOWN.
+      //
+      // With this capability, frameworks can define how they would
+      // like to handle partitioned tasks. Frameworks will receive
+      // TASK_UNREACHABLE for tasks on partitioned agents; if/when the
+      // partitioned agent reregisters, the task will not be killed.
+      // Frameworks that enable this capability will never receive
+      // TASK_LOST; they will receive one of the most specific task
+      // statuses listed above instead.
+      //
+      // Without this capability, frameworks will receive TASK_LOST
+      // for tasks on partitioned agents; such tasks will be killed by
+      // Mesos when the agent reregisters (unless the master has
+      // failed over).
+      PARTITION_AWARE = 5;
     }
 
     // Enum fields should be optional, see: MESOS-4997.
@@ -1488,8 +1506,47 @@ enum TaskState {
   TASK_FINISHED = 2; // TERMINAL: The task finished successfully.
   TASK_FAILED = 3;   // TERMINAL: The task failed to finish successfully.
   TASK_KILLED = 4;   // TERMINAL: The task was killed by the executor.
-  TASK_LOST = 5;     // TERMINAL: The task failed but can be rescheduled.
   TASK_ERROR = 7;    // TERMINAL: The task description contains an error.
+
+  // This is only sent when the framework does NOT opt-in to the
+  // PARTITION_AWARE capability.
+  TASK_LOST = 5;     // TERMINAL: The task failed but can be rescheduled.
+
+  // The following task statuses are only sent when the framework
+  // opts-in to the PARTITION_AWARE capability.
+
+  // The task failed to launch because of a transient error. The
+  // task's executor never started running. Unlike TASK_ERROR, the
+  // task description is valid -- attempting to launch the task again
+  // may be successful. This is a terminal state.
+  TASK_DROPPED = 9;
+
+  // The task was running on an agent that has lost contact with the
+  // master, typically due to a network failure or partition. The task
+  // may or may not still be running.
+  TASK_UNREACHABLE = 10;
+
+  // The task was running on an agent that has been shutdown (e.g.,
+  // the agent become partitioned, rebooted, and then reconnected to
+  // the master; any tasks running before the reboot will transition
+  // from UNREACHABLE to GONE). The task is no longer running. This is
+  // a terminal state.
+  TASK_GONE = 11;
+
+  // The task was running on an agent that the master cannot contact;
+  // the operator has asserted that the agent has been shutdown, but
+  // this has not been directly confirmed by the master. If the
+  // operator is correct, the task is not running and this is a
+  // terminal state; if the operator is mistaken, the task might still
+  // be running, and might return to the RUNNING state in the future.
+  TASK_GONE_BY_OPERATOR = 12;
+
+  // The master has no knowledge of the task. This is typically
+  // because either (a) the master never had knowledge of the task, or
+  // (b) the master forgot about the task because it garbaged
+  // collected its metadata about the task. The task may or may not
+  // still be running.
+  TASK_UNKNOWN = 13;
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/src/common/protobuf_utils.cpp
----------------------------------------------------------------------
diff --git a/src/common/protobuf_utils.cpp b/src/common/protobuf_utils.cpp
index 8c4a726..ed3ac7f 100644
--- a/src/common/protobuf_utils.cpp
+++ b/src/common/protobuf_utils.cpp
@@ -76,11 +76,17 @@ bool frameworkHasCapability(
 
 bool isTerminalState(const TaskState& state)
 {
+  // TODO(neilc): Revise/rename this function. LOST, UNREACHABLE, and
+  // GONE_BY_OPERATOR are not truly "terminal".
   return (state == TASK_FINISHED ||
           state == TASK_FAILED ||
           state == TASK_KILLED ||
           state == TASK_LOST ||
-          state == TASK_ERROR);
+          state == TASK_ERROR ||
+          state == TASK_UNREACHABLE ||
+          state == TASK_DROPPED ||
+          state == TASK_GONE ||
+          state == TASK_GONE_BY_OPERATOR);
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/src/examples/disk_full_framework.cpp
----------------------------------------------------------------------
diff --git a/src/examples/disk_full_framework.cpp b/src/examples/disk_full_framework.cpp
index ad304fe..1221f83 100644
--- a/src/examples/disk_full_framework.cpp
+++ b/src/examples/disk_full_framework.cpp
@@ -230,6 +230,10 @@ public:
     case TASK_KILLED:
     case TASK_LOST:
     case TASK_ERROR:
+    case TASK_DROPPED:
+    case TASK_UNREACHABLE:
+    case TASK_GONE:
+    case TASK_GONE_BY_OPERATOR:
       if (flags.run_once) {
         driver->abort();
       }
@@ -241,6 +245,7 @@ public:
     case TASK_RUNNING:
     case TASK_STAGING:
     case TASK_KILLING:
+    case TASK_UNKNOWN:
       break;
     }
   }

http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/src/master/http.cpp
----------------------------------------------------------------------
diff --git a/src/master/http.cpp b/src/master/http.cpp
index c6bdad6..525ef6c 100644
--- a/src/master/http.cpp
+++ b/src/master/http.cpp
@@ -2872,7 +2872,12 @@ struct TaskStateSummary
       killed(0),
       failed(0),
       lost(0),
-      error(0) {}
+      error(0),
+      dropped(0),
+      unreachable(0),
+      gone(0),
+      gone_by_operator(0),
+      unknown(0) {}
 
   // Account for the state of the given task.
   void count(const Task& task)
@@ -2887,6 +2892,11 @@ struct TaskStateSummary
       case TASK_FAILED: { ++failed; break; }
       case TASK_LOST: { ++lost; break; }
       case TASK_ERROR: { ++error; break; }
+      case TASK_DROPPED: { ++dropped; break; }
+      case TASK_UNREACHABLE: { ++unreachable; break; }
+      case TASK_GONE: { ++gone; break; }
+      case TASK_GONE_BY_OPERATOR: { ++gone_by_operator; break; }
+      case TASK_UNKNOWN: { ++unknown; break; }
       // No default case allows for a helpful compiler error if we
       // introduce a new state.
     }
@@ -2901,6 +2911,11 @@ struct TaskStateSummary
   size_t failed;
   size_t lost;
   size_t error;
+  size_t dropped;
+  size_t unreachable;
+  size_t gone;
+  size_t gone_by_operator;
+  size_t unknown;
 };
 
 
@@ -3045,6 +3060,7 @@ Future<Response> Master::Http::stateSummary(
               const TaskStateSummary& summary =
                 taskStateSummaries.slave(slave->id);
 
+              // TODO(neilc): Update for new PARTITION_AWARE task statuses.
               writer->field("TASK_STAGING", summary.staging);
               writer->field("TASK_STARTING", summary.starting);
               writer->field("TASK_RUNNING", summary.running);
@@ -3095,6 +3111,7 @@ Future<Response> Master::Http::stateSummary(
               const TaskStateSummary& summary =
                 taskStateSummaries.framework(frameworkId);
 
+              // TODO(neilc): Update for new PARTITION_AWARE task statuses.
               writer->field("TASK_STAGING", summary.staging);
               writer->field("TASK_STARTING", summary.starting);
               writer->field("TASK_RUNNING", summary.running);

http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 5c00f33..ae38c1a 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -7094,12 +7094,42 @@ void Master::updateTask(Task* task, const StatusUpdate& update)
     }
 
     switch (status.state()) {
-      case TASK_FINISHED: ++metrics->tasks_finished; break;
-      case TASK_FAILED:   ++metrics->tasks_failed;   break;
-      case TASK_KILLED:   ++metrics->tasks_killed;   break;
-      case TASK_LOST:     ++metrics->tasks_lost;     break;
-      case TASK_ERROR:    ++metrics->tasks_error;    break;
-      default:                                       break;
+      case TASK_FINISHED:
+        ++metrics->tasks_finished;
+        break;
+      case TASK_FAILED:
+        ++metrics->tasks_failed;
+        break;
+      case TASK_KILLED:
+        ++metrics->tasks_killed;
+        break;
+      case TASK_LOST:
+        ++metrics->tasks_lost;
+        break;
+      case TASK_ERROR:
+        ++metrics->tasks_error;
+        break;
+      case TASK_UNREACHABLE:
+        ++metrics->tasks_unreachable;
+        break;
+      case TASK_DROPPED:
+        ++metrics->tasks_dropped;
+        break;
+      case TASK_GONE:
+        ++metrics->tasks_gone;
+        break;
+      case TASK_GONE_BY_OPERATOR:
+        ++metrics->tasks_gone_by_operator;
+        break;
+      case TASK_STARTING:
+      case TASK_STAGING:
+      case TASK_RUNNING:
+      case TASK_KILLING:
+        break;
+      case TASK_UNKNOWN:
+        // Should not happen.
+        LOG(FATAL) << "Unexpected TASK_UNKNOWN for in-memory task";
+        break;
     }
 
     if (status.has_reason()) {

http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/src/master/metrics.cpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.cpp b/src/master/metrics.cpp
index 88a752d..3d3338e 100644
--- a/src/master/metrics.cpp
+++ b/src/master/metrics.cpp
@@ -93,6 +93,14 @@ Metrics::Metrics(const Master& master)
         "master/tasks_lost"),
     tasks_error(
         "master/tasks_error"),
+    tasks_dropped(
+        "master/tasks_dropped"),
+    tasks_unreachable(
+        "master/tasks_unreachable"),
+    tasks_gone(
+        "master/tasks_gone"),
+    tasks_gone_by_operator(
+        "master/tasks_gone_by_operator"),
     dropped_messages(
         "master/dropped_messages"),
     messages_register_framework(
@@ -208,6 +216,10 @@ Metrics::Metrics(const Master& master)
   process::metrics::add(tasks_killed);
   process::metrics::add(tasks_lost);
   process::metrics::add(tasks_error);
+  process::metrics::add(tasks_dropped);
+  process::metrics::add(tasks_unreachable);
+  process::metrics::add(tasks_gone);
+  process::metrics::add(tasks_gone_by_operator);
 
   process::metrics::add(dropped_messages);
 
@@ -345,6 +357,10 @@ Metrics::~Metrics()
   process::metrics::remove(tasks_killed);
   process::metrics::remove(tasks_lost);
   process::metrics::remove(tasks_error);
+  process::metrics::remove(tasks_dropped);
+  process::metrics::remove(tasks_unreachable);
+  process::metrics::remove(tasks_gone);
+  process::metrics::remove(tasks_gone_by_operator);
 
   process::metrics::remove(dropped_messages);
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/src/master/metrics.hpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.hpp b/src/master/metrics.hpp
index 9d201fc..cfddb4b 100644
--- a/src/master/metrics.hpp
+++ b/src/master/metrics.hpp
@@ -66,6 +66,10 @@ struct Metrics
   process::metrics::Counter tasks_killed;
   process::metrics::Counter tasks_lost;
   process::metrics::Counter tasks_error;
+  process::metrics::Counter tasks_dropped;
+  process::metrics::Counter tasks_unreachable;
+  process::metrics::Counter tasks_gone;
+  process::metrics::Counter tasks_gone_by_operator;
 
   typedef hashmap<TaskStatus::Reason, process::metrics::Counter> Reasons;
   typedef hashmap<TaskStatus::Source, Reasons> SourcesReasons;

http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 398164d..4c12615 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -3093,7 +3093,9 @@ TEST_F(MasterTest, StateEndpointFrameworkInfo)
 
   vector<FrameworkInfo::Capability::Type> capabilities = {
     FrameworkInfo::Capability::REVOCABLE_RESOURCES,
-    FrameworkInfo::Capability::TASK_KILLING_STATE
+    FrameworkInfo::Capability::TASK_KILLING_STATE,
+    FrameworkInfo::Capability::GPU_RESOURCES,
+    FrameworkInfo::Capability::PARTITION_AWARE
   };
 
   foreach (FrameworkInfo::Capability::Type capability, capabilities) {


[6/6] mesos git commit: Made a few minor tweaks to comments.

Posted by vi...@apache.org.
Made a few minor tweaks to comments.

Review: https://reviews.apache.org/r/50704/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/8330e99f
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/8330e99f
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/8330e99f

Branch: refs/heads/master
Commit: 8330e99f6e9842ec665414fc5f497e81ef7ed7b4
Parents: 0b90ccc
Author: Neil Conway <ne...@gmail.com>
Authored: Fri Aug 26 14:48:47 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Fri Aug 26 14:49:49 2016 -0700

----------------------------------------------------------------------
 src/messages/messages.proto | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/8330e99f/src/messages/messages.proto
----------------------------------------------------------------------
diff --git a/src/messages/messages.proto b/src/messages/messages.proto
index 17bb352..7d65be1 100644
--- a/src/messages/messages.proto
+++ b/src/messages/messages.proto
@@ -416,9 +416,12 @@ message RegisterSlaveMessage {
 
 
 /**
- * Registers the agent with the master.
- * This is used when the agent has previously registered and
- * the master changes to a newly elected master.
+ * Reregisters the agent with the master.
+ *
+ * This is used when the agent has previously registered and the agent
+ * has reason to suspect that it should re-establish its connection
+ * (e.g., a new master is elected or the agent hasn't seen a ping from
+ * the master for a long period of time).
  *
  * If registration fails, a `ShutdownMessage` is sent to the agent.
  * Failure conditions are documented inline in Master::reregisterSlave.


[2/6] mesos git commit: Added a list of "unreachable" agents to the registry.

Posted by vi...@apache.org.
Added a list of "unreachable" agents to the registry.

These are agents that have failed health checks.

Review: https://reviews.apache.org/r/50700/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/54059140
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/54059140
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/54059140

Branch: refs/heads/master
Commit: 540591407729ae9eaf81f68cb025b181782c5b99
Parents: c3268ca
Author: Neil Conway <ne...@gmail.com>
Authored: Fri Aug 26 14:48:03 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Fri Aug 26 14:48:03 2016 -0700

----------------------------------------------------------------------
 src/master/registry.proto | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/54059140/src/master/registry.proto
----------------------------------------------------------------------
diff --git a/src/master/registry.proto b/src/master/registry.proto
index 9bf9998..03c896c 100644
--- a/src/master/registry.proto
+++ b/src/master/registry.proto
@@ -41,6 +41,17 @@ message Registry {
     repeated Slave slaves = 1;
   }
 
+  message UnreachableSlave {
+    required SlaveID id = 1;
+
+    // The time when the slave was marked unreachable by the master.
+    required TimeInfo timestamp = 2;
+  }
+
+  message UnreachableSlaves {
+    repeated UnreachableSlave slaves = 1;
+  }
+
   message Machine {
     required MachineInfo info = 1;
   }
@@ -60,9 +71,13 @@ message Registry {
   // Most recent leading master.
   optional Master master = 1;
 
-  // All admitted slaves.
+  // All admitted (healthy) slaves.
   optional Slaves slaves = 2;
 
+  // Slaves that have failed health checks. They may or may not still
+  // be running.
+  optional UnreachableSlaves unreachable = 7;
+
   // Holds a list of machines and some status information about each.
   // See comments in `MachineInfo` for more information.
   optional Machines machines = 3;


[3/6] mesos git commit: Added registrar operations for marking agents (un-)reachable.

Posted by vi...@apache.org.
Added registrar operations for marking agents (un-)reachable.

Review: https://reviews.apache.org/r/50701/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/af496f3a
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/af496f3a
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/af496f3a

Branch: refs/heads/master
Commit: af496f3a80da9a8e7961fb62f839aacf1658222e
Parents: 5405914
Author: Neil Conway <ne...@gmail.com>
Authored: Fri Aug 26 14:48:07 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Fri Aug 26 14:48:07 2016 -0700

----------------------------------------------------------------------
 src/master/master.hpp         | 118 ++++++++++++++++++++++++++++++++++++-
 src/tests/registrar_tests.cpp |  95 ++++++++++++++++++++---------
 2 files changed, 183 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/af496f3a/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 6decff6..b118293 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -1891,7 +1891,7 @@ private:
 };
 
 
-// Implementation of slave admission Registrar operation.
+// Add a new slave to the list of admitted slaves.
 class AdmitSlave : public Operation
 {
 public:
@@ -1906,7 +1906,7 @@ protected:
       hashset<SlaveID>* slaveIDs,
       bool strict)
   {
-    // Check and see if this slave already exists.
+    // Check and see if this slave is currently admitted.
     if (slaveIDs->contains(info.id())) {
       if (strict) {
         return Error("Agent already admitted");
@@ -1915,6 +1915,9 @@ protected:
       }
     }
 
+    // TODO(neilc): Check if the slave appears in the list of
+    // `unreachable` slaves in the registry?
+
     Registry::Slave* slave = registry->mutable_slaves()->add_slaves();
     slave->mutable_info()->CopyFrom(info);
     slaveIDs->insert(info.id());
@@ -1926,6 +1929,117 @@ private:
 };
 
 
+// Move a slave from the list of admitted slaves to the list of
+// unreachable slaves.
+class MarkSlaveUnreachable : public Operation
+{
+public:
+  explicit MarkSlaveUnreachable(const SlaveInfo& _info) : info(_info) {
+    CHECK(info.has_id()) << "SlaveInfo is missing the 'id' field";
+  }
+
+protected:
+  virtual Try<bool> perform(
+      Registry* registry,
+      hashset<SlaveID>* slaveIDs,
+      bool strict)
+  {
+    // As currently implemented, this should not be possible: the
+    // master will only mark slaves unreachable that are currently
+    // admitted.
+    if (!slaveIDs->contains(info.id())) {
+      return Error("Agent not yet admitted");
+    }
+
+    for (int i = 0; i < registry->slaves().slaves().size(); i++) {
+      const Registry::Slave& slave = registry->slaves().slaves(i);
+
+      if (slave.info().id() == info.id()) {
+        registry->mutable_slaves()->mutable_slaves()->DeleteSubrange(i, 1);
+        slaveIDs->erase(info.id());
+
+        Registry::UnreachableSlave* unreachable =
+          registry->mutable_unreachable()->add_slaves();
+
+        unreachable->mutable_id()->CopyFrom(info.id());
+        unreachable->mutable_timestamp()->CopyFrom(protobuf::getCurrentTime());
+
+        return true; // Mutation.
+      }
+    }
+
+    // Should not happen.
+    return Error("Failed to find agent " + stringify(info.id()));
+  }
+
+private:
+  const SlaveInfo info;
+};
+
+
+// Add a slave back to the list of admitted slaves. The slave will
+// typically be in the "unreachable" list; if so, it is removed from
+// that list. The slave might also be in the "admitted" list already.
+// Finally, the slave might be in neither the "unreachable" or
+// "admitted" lists, if its metadata has been garbage collected from
+// the registry.
+class MarkSlaveReachable : public Operation
+{
+public:
+  explicit MarkSlaveReachable(const SlaveInfo& _info) : info(_info) {
+    CHECK(info.has_id()) << "SlaveInfo is missing the 'id' field";
+  }
+
+protected:
+  virtual Try<bool> perform(
+      Registry* registry,
+      hashset<SlaveID>* slaveIDs,
+      bool strict)
+  {
+    // A slave might try to reregister that appears in the list of
+    // admitted slaves. This can occur when the master fails over:
+    // agents will usually attempt to reregister with the new master
+    // before they are marked unreachable. In this situation, the
+    // registry is already in the correct state, so no changes are
+    // needed.
+    if (slaveIDs->contains(info.id())) {
+      return false; // No mutation.
+    }
+
+    // Check whether the slave is in the unreachable list.
+    // TODO(neilc): Optimize this to avoid linear scan.
+    bool found = false;
+    for (int i = 0; i < registry->unreachable().slaves().size(); i++) {
+      const Registry::UnreachableSlave& slave =
+        registry->unreachable().slaves(i);
+
+      if (slave.id() == info.id()) {
+        registry->mutable_unreachable()->mutable_slaves()->DeleteSubrange(i, 1);
+        found = true;
+        break;
+      }
+    }
+
+    if (!found) {
+      LOG(WARNING) << "Allowing UNKNOWN agent to reregister: " << info;
+    }
+
+    // Add the slave to the admitted list, even if we didn't find it
+    // in the unreachable list. This accounts for when the slave was
+    // unreachable for a long time, was GC'd from the unreachable
+    // list, but then eventually reregistered.
+    Registry::Slave* slave = registry->mutable_slaves()->add_slaves();
+    slave->mutable_info()->CopyFrom(info);
+    slaveIDs->insert(info.id());
+
+    return true; // Mutation.
+  }
+
+private:
+  const SlaveInfo info;
+};
+
+
 // Implementation of slave readmission Registrar operation.
 class ReadmitSlave : public Operation
 {

http://git-wip-us.apache.org/repos/asf/mesos/blob/af496f3a/src/tests/registrar_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/registrar_tests.cpp b/src/tests/registrar_tests.cpp
index 9a71d8f..b04fc92 100644
--- a/src/tests/registrar_tests.cpp
+++ b/src/tests/registrar_tests.cpp
@@ -236,7 +236,9 @@ TEST_P(RegistrarTest, Recover)
   AWAIT_EXPECT_FAILED(
       registrar.apply(Owned<Operation>(new AdmitSlave(slave))));
   AWAIT_EXPECT_FAILED(
-      registrar.apply(Owned<Operation>(new ReadmitSlave(slave))));
+      registrar.apply(Owned<Operation>(new MarkSlaveUnreachable(slave))));
+  AWAIT_EXPECT_FAILED(
+      registrar.apply(Owned<Operation>(new MarkSlaveReachable(slave))));
   AWAIT_EXPECT_FAILED(
       registrar.apply(Owned<Operation>(new RemoveSlave(slave))));
 
@@ -246,8 +248,10 @@ TEST_P(RegistrarTest, Recover)
   // operations to ensure they do not fail.
   Future<bool> admit = registrar.apply(
       Owned<Operation>(new AdmitSlave(slave)));
-  Future<bool> readmit = registrar.apply(
-      Owned<Operation>(new ReadmitSlave(slave)));
+  Future<bool> unreachable = registrar.apply(
+      Owned<Operation>(new MarkSlaveUnreachable(slave)));
+  Future<bool> reachable = registrar.apply(
+      Owned<Operation>(new MarkSlaveReachable(slave)));
   Future<bool> remove = registrar.apply(
       Owned<Operation>(new RemoveSlave(slave)));
 
@@ -255,7 +259,8 @@ TEST_P(RegistrarTest, Recover)
   EXPECT_EQ(master, registry.get().master().info());
 
   AWAIT_TRUE(admit);
-  AWAIT_TRUE(readmit);
+  AWAIT_TRUE(unreachable);
+  AWAIT_TRUE(reachable);
   AWAIT_TRUE(remove);
 }
 
@@ -275,7 +280,7 @@ TEST_P(RegistrarTest, Admit)
 }
 
 
-TEST_P(RegistrarTest, Readmit)
+TEST_P(RegistrarTest, MarkReachable)
 {
   Registrar registrar(flags, state);
   AWAIT_READY(registrar.recover(master));
@@ -296,13 +301,51 @@ TEST_P(RegistrarTest, Readmit)
 
   AWAIT_TRUE(registrar.apply(Owned<Operation>(new AdmitSlave(info1))));
 
-  AWAIT_TRUE(registrar.apply(Owned<Operation>(new ReadmitSlave(info1))));
+  // Attempting to mark a slave as reachable that is already reachable
+  // should not result in an error.
+  AWAIT_TRUE(registrar.apply(Owned<Operation>(new MarkSlaveReachable(info1))));
+  AWAIT_TRUE(registrar.apply(Owned<Operation>(new MarkSlaveReachable(info1))));
 
-  if (flags.registry_strict) {
-    AWAIT_FALSE(registrar.apply(Owned<Operation>(new ReadmitSlave(info2))));
-  } else {
-    AWAIT_TRUE(registrar.apply(Owned<Operation>(new ReadmitSlave(info2))));
-  }
+  // Attempting to mark a slave as reachable that is not in the
+  // unreachable list should not result in error.
+  AWAIT_TRUE(registrar.apply(Owned<Operation>(new MarkSlaveReachable(info2))));
+}
+
+
+TEST_P(RegistrarTest, MarkUnreachable)
+{
+  Registrar registrar(flags, state);
+  AWAIT_READY(registrar.recover(master));
+
+  SlaveInfo info1;
+  info1.set_hostname("localhost");
+
+  SlaveID id1;
+  id1.set_value("1");
+  info1.mutable_id()->CopyFrom(id1);
+
+  SlaveID id2;
+  id2.set_value("2");
+
+  SlaveInfo info2;
+  info2.set_hostname("localhost");
+  info2.mutable_id()->CopyFrom(id2);
+
+  AWAIT_TRUE(registrar.apply(Owned<Operation>(new AdmitSlave(info1))));
+
+  AWAIT_TRUE(
+      registrar.apply(Owned<Operation>(new MarkSlaveUnreachable(info1))));
+
+  AWAIT_TRUE(
+      registrar.apply(Owned<Operation>(new MarkSlaveReachable(info1))));
+
+  AWAIT_TRUE(
+      registrar.apply(Owned<Operation>(new MarkSlaveUnreachable(info1))));
+
+  // If a slave is already unreachable, trying to mark it unreachable
+  // again should fail.
+  AWAIT_FALSE(
+      registrar.apply(Owned<Operation>(new MarkSlaveUnreachable(info1))));
 }
 
 
@@ -977,20 +1020,17 @@ TEST_P(RegistrarTest, UpdateWeights)
 
 TEST_P(RegistrarTest, Bootstrap)
 {
-  // Run 1 readmits a slave that is not present.
+  // Run 1 simulates the reregistration of a slave that is not present
+  // in the registry.
   {
     Registrar registrar(flags, state);
     AWAIT_READY(registrar.recover(master));
 
-    // If not strict, we should be allowed to readmit the slave.
-    if (flags.registry_strict) {
-      AWAIT_FALSE(registrar.apply(Owned<Operation>(new ReadmitSlave(slave))));
-    } else {
-      AWAIT_TRUE(registrar.apply(Owned<Operation>(new ReadmitSlave(slave))));
-    }
+    AWAIT_TRUE(
+        registrar.apply(Owned<Operation>(new MarkSlaveReachable(slave))));
   }
 
-  // Run 2 should see the slave if not strict.
+  // Run 2 should see the slave.
   {
     Registrar registrar(flags, state);
 
@@ -998,12 +1038,8 @@ TEST_P(RegistrarTest, Bootstrap)
 
     AWAIT_READY(registry);
 
-    if (flags.registry_strict) {
-      EXPECT_EQ(0, registry.get().slaves().slaves().size());
-    } else {
-      ASSERT_EQ(1, registry.get().slaves().slaves().size());
-      EXPECT_EQ(slave, registry.get().slaves().slaves(0).info());
-    }
+    ASSERT_EQ(1, registry.get().slaves().slaves().size());
+    EXPECT_EQ(slave, registry.get().slaves().slaves(0).info());
   }
 }
 
@@ -1206,13 +1242,16 @@ TEST_P(Registrar_BENCHMARK_Test, Performance)
   // same as in production).
   std::random_shuffle(infos.begin(), infos.end());
 
-  // Readmit slaves.
+  // Mark slaves reachable again. This simulates the master failing
+  // over, and then the previously registered agents reregistering
+  // with the new master.
   watch.start();
   foreach (const SlaveInfo& info, infos) {
-    result = registrar.apply(Owned<Operation>(new ReadmitSlave(info)));
+    result = registrar.apply(Owned<Operation>(new MarkSlaveReachable(info)));
   }
   AWAIT_READY_FOR(result, Minutes(5));
-  LOG(INFO) << "Readmitted " << slaveCount << " agents in " << watch.elapsed();
+  LOG(INFO) << "Marked " << slaveCount
+            << " agents reachable in " << watch.elapsed();
 
   // Recover slaves.
   Registrar registrar2(flags, state);


[5/6] mesos git commit: Removed a no-longer-relevant test.

Posted by vi...@apache.org.
Removed a no-longer-relevant test.

The behavior this test is trying to validate (slaves receive a
`ShutdownMessage` if they attempt to reregister after failing health
checks) will be changed shortly. Moreover, the new behavior is already
covered by other test cases.

Review: https://reviews.apache.org/r/50703/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/0b90ccca
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/0b90ccca
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/0b90ccca

Branch: refs/heads/master
Commit: 0b90cccaca0069a2e2fff54d1424d205659346a3
Parents: 93016d3
Author: Neil Conway <ne...@gmail.com>
Authored: Fri Aug 26 14:48:39 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Fri Aug 26 14:49:49 2016 -0700

----------------------------------------------------------------------
 src/tests/slave_recovery_tests.cpp | 142 --------------------------------
 1 file changed, 142 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/0b90ccca/src/tests/slave_recovery_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_recovery_tests.cpp b/src/tests/slave_recovery_tests.cpp
index 9ff19f4..3c69e56 100644
--- a/src/tests/slave_recovery_tests.cpp
+++ b/src/tests/slave_recovery_tests.cpp
@@ -3186,148 +3186,6 @@ TYPED_TEST(SlaveRecoveryTest, SchedulerFailover)
 }
 
 
-// The purpose of this test is to ensure that during a network
-// partition, the master will remove a partitioned slave. When the
-// partition is removed, the slave will receive a ShutdownMessage.
-// When the slave starts again on the same host, we verify that the
-// slave will not try to reregister itself with the master. It will
-// register itself with the master and get a new slave id.
-TYPED_TEST(SlaveRecoveryTest, PartitionedSlave)
-{
-  master::Flags masterFlags = this->CreateMasterFlags();
-  Try<Owned<cluster::Master>> master = this->StartMaster(masterFlags);
-  ASSERT_SOME(master);
-
-  // Set these expectations up before we spawn the slave so that we
-  // don't miss the first PING.
-  Future<Message> ping = FUTURE_MESSAGE(
-      Eq(PingSlaveMessage().GetTypeName()), _, _);
-
-  // Drop all the PONGs to simulate slave partition.
-  DROP_PROTOBUFS(PongSlaveMessage(), _, _);
-
-  slave::Flags flags = this->CreateSlaveFlags();
-
-  Fetcher fetcher;
-
-  Try<TypeParam*> _containerizer = TypeParam::create(flags, true, &fetcher);
-  ASSERT_SOME(_containerizer);
-  Owned<slave::Containerizer> containerizer(_containerizer.get());
-
-  Owned<MasterDetector> detector = master.get()->createDetector();
-
-  Try<Owned<cluster::Slave>> slave =
-    this->StartSlave(detector.get(), containerizer.get(), flags);
-  ASSERT_SOME(slave);
-
-  // Enable checkpointing for the framework.
-  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
-  frameworkInfo.set_checkpoint(true);
-
-  MockScheduler sched;
-  MesosSchedulerDriver driver(
-      &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
-
-  EXPECT_CALL(sched, registered(_, _, _));
-
-  Future<vector<Offer>> offers;
-  EXPECT_CALL(sched, resourceOffers(&driver, _))
-    .WillOnce(FutureArg<1>(&offers))
-    .WillRepeatedly(Return());
-
-  driver.start();
-
-  AWAIT_READY(offers);
-  EXPECT_NE(0u, offers.get().size());
-
-  // Long running task.
-  TaskInfo task = createTask(offers.get()[0], "sleep 1000");
-
-  EXPECT_CALL(sched, statusUpdate(_, _));
-
-  Future<Nothing> _statusUpdateAcknowledgement =
-    FUTURE_DISPATCH(_, &Slave::_statusUpdateAcknowledgement);
-
-  driver.launchTasks(offers.get()[0].id(), {task});
-
-  // Wait for the ACK to be checkpointed.
-  AWAIT_READY(_statusUpdateAcknowledgement);
-
-  Future<ShutdownMessage> shutdownMessage =
-    FUTURE_PROTOBUF(ShutdownMessage(), _, slave.get()->pid);
-
-  Future<Nothing> slaveLost;
-  EXPECT_CALL(sched, slaveLost(&driver, _))
-    .WillOnce(FutureSatisfy(&slaveLost));
-
-  Future<TaskStatus> status;
-  EXPECT_CALL(sched, statusUpdate(&driver, _))
-    .WillOnce(FutureArg<1>(&status));
-
-  Future<Nothing> executorTerminated =
-    FUTURE_DISPATCH(_, &Slave::executorTerminated);
-
-  Clock::pause();
-
-  // Now, induce a partition of the slave by having the master
-  // timeout the slave.
-  size_t pings = 0;
-  while (true) {
-    AWAIT_READY(ping);
-    pings++;
-    if (pings == masterFlags.max_agent_ping_timeouts) {
-      break;
-    }
-    ping = FUTURE_MESSAGE(Eq(PingSlaveMessage().GetTypeName()), _, _);
-    Clock::advance(masterFlags.agent_ping_timeout);
-  }
-
-  Clock::advance(masterFlags.agent_ping_timeout);
-
-  // The master will notify the framework that the slave was lost.
-  AWAIT_READY(slaveLost);
-
-  // The master will have notified the framework of the lost task.
-  AWAIT_READY(status);
-
-  EXPECT_EQ(TASK_LOST, status.get().state());
-  EXPECT_EQ(TaskStatus::SOURCE_MASTER, status.get().source());
-  EXPECT_EQ(TaskStatus::REASON_SLAVE_REMOVED, status.get().reason());
-
-  // Wait for the master to attempt to shut down the slave.
-  AWAIT_READY(shutdownMessage);
-
-  // Wait for the executor to be terminated.
-  while (executorTerminated.isPending()) {
-    Clock::advance(process::MAX_REAP_INTERVAL());
-    Clock::settle();
-  }
-
-  AWAIT_READY(executorTerminated);
-  Clock::settle();
-
-  Clock::resume();
-
-  slave.get()->terminate();
-
-  Future<RegisterSlaveMessage> registerSlaveMessage =
-    FUTURE_PROTOBUF(RegisterSlaveMessage(), _, _);
-
-  // Restart the slave (use same flags) with a new isolator.
-  _containerizer = TypeParam::create(flags, true, &fetcher);
-  ASSERT_SOME(_containerizer);
-  containerizer.reset(_containerizer.get());
-
-  slave = this->StartSlave(detector.get(), containerizer.get(), flags);
-  ASSERT_SOME(slave);
-
-  AWAIT_READY(registerSlaveMessage);
-
-  driver.stop();
-  driver.join();
-}
-
-
 // This test verifies that if the master changes when the slave is
 // down, the slave can still recover the task when it restarts. We
 // verify its correctness by killing the task from the scheduler.


[4/6] mesos git commit: Renamed metrics from "slave_shutdowns" to "slave_unreachable".

Posted by vi...@apache.org.
Renamed metrics from "slave_shutdowns" to "slave_unreachable".

The master will shortly be changed to no longer shutdown unhealthy
agents, so the previous metric name is no longer accurate. The old
metric names have been kept for backwards compatibility, but they
are no longer updated (i.e., they will always be set to zero).

Review: https://reviews.apache.org/r/50702/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/93016d37
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/93016d37
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/93016d37

Branch: refs/heads/master
Commit: 93016d37bf8833d7a78ada9c4ec59a374419ba35
Parents: af496f3
Author: Neil Conway <ne...@gmail.com>
Authored: Fri Aug 26 14:48:16 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Fri Aug 26 14:49:43 2016 -0700

----------------------------------------------------------------------
 CHANGELOG                     | 14 ++++++++++++++
 docs/monitoring.md            | 24 +++++++++++++-----------
 src/master/master.cpp         | 13 ++++++-------
 src/master/metrics.cpp        | 16 +++++++++++++++-
 src/master/metrics.hpp        |  7 +++++++
 src/tests/master_tests.cpp    |  4 ++--
 src/tests/partition_tests.cpp |  2 ++
 7 files changed, 59 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/93016d37/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index ffeaf10..587d843 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,17 @@
+Release Notes - Mesos - Version 1.1.0 (WIP)
+--------------------------------------------
+This release contains the following new features:
+
+Deprecations:
+
+  * The following metrics are deprecated and will be removed in Mesos 1.4:
+    master/slave_shutdowns_scheduled, master/slave_shutdowns_canceled, and
+    slave_shutdowns_completed. As of Mesos 1.1.0, these metrics will always be
+    zero. The following new metrics have been introduced as replacements:
+    master/slave_unreachable_scheduled, master/slave_unreachable_canceled,
+    and master/slave_unreachable_completed.
+
+
 Release Notes - Mesos - Version 1.0.1
 --------------------------------------------
 * This is a bug fix release.

http://git-wip-us.apache.org/repos/asf/mesos/blob/93016d37/docs/monitoring.md
----------------------------------------------------------------------
diff --git a/docs/monitoring.md b/docs/monitoring.md
index e19ecd0..f32ee40 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -347,30 +347,32 @@ unhealthy or that they are not able to connect to the elected master.
 </tr>
 <tr>
   <td>
-  <code>master/slave_shutdowns_scheduled</code>
+  <code>master/slave_unreachable_scheduled</code>
   </td>
   <td>Number of agents which have failed their health check and are scheduled
-      to be removed. They will not be immediately removed due to the Agent
-      Removal Rate-Limit, but <code>master/slave_shutdowns_completed</code>
+      to be marked unreachable. They will not be marked unreachable immediately due to the Agent
+      Removal Rate-Limit, but <code>master/slave_unreachable_completed</code>
       will start increasing as they do get removed.</td>
   <td>Counter</td>
 </tr>
 <tr>
   <td>
-  <code>master/slave_shutdowns_canceled</code>
+  <code>master/slave_unreachable_canceled</code>
   </td>
-  <td>Number of cancelled agent shutdowns. This happens when the agent removal
-      rate limit allows for an agent to reconnect and send a <code>PONG</code>
-      to the master before being removed.</td>
+  <td>Number of times that an agent was due to be marked unreachable but this
+      transition was cancelled. This happens when the agent removal rate limit
+      is enabled and the agent sends a <code>PONG</code> response message to the
+      master before the rate limit allows the agent to be marked unreachable.</td>
   <td>Counter</td>
 </tr>
 <tr>
   <td>
-  <code>master/slave_shutdowns_completed</code>
+  <code>master/slave_unreachable_completed</code>
   </td>
-  <td>Number of agents that failed their health check. These are agents which
-      were not heard from despite the agent-removal rate limit, and have been
-      removed from the master's agent registry.</td>
+  <td>Number of agents that were marked as unreachable because they failed
+      health checks. These are agents which were not heard from despite the
+      agent-removal rate limit, and have been marked as unreachable in the
+      master's agent registry.</td>
   <td>Counter</td>
 </tr>
 <tr>

http://git-wip-us.apache.org/repos/asf/mesos/blob/93016d37/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index ae38c1a..2b4aff8 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -233,7 +233,7 @@ protected:
     }
 
     shuttingDown = acquire.onAny(defer(self(), &Self::_shutdown));
-    ++metrics->slave_shutdowns_scheduled;
+    ++metrics->slave_unreachable_scheduled;
   }
 
   void _shutdown()
@@ -248,7 +248,7 @@ protected:
       LOG(INFO) << "Shutting down agent " << slaveId
                 << " due to health check timeout";
 
-      ++metrics->slave_shutdowns_completed;
+      ++metrics->slave_unreachable_completed;
 
       dispatch(master,
                &Master::shutdownSlave,
@@ -258,7 +258,7 @@ protected:
       LOG(INFO) << "Canceling shutdown of agent " << slaveId
                 << " since a pong is received!";
 
-      ++metrics->slave_shutdowns_canceled;
+      ++metrics->slave_unreachable_canceled;
     }
 
     shuttingDown = None();
@@ -1724,7 +1724,7 @@ void Master::recoveredSlavesTimeout(const Registry& registry)
       .onFailed(lambda::bind(fail, failure, lambda::_1))
       .onDiscarded(lambda::bind(fail, failure, "discarded"));
 
-    ++metrics->slave_shutdowns_scheduled;
+    ++metrics->slave_unreachable_scheduled;
   }
 }
 
@@ -1737,8 +1737,7 @@ Nothing Master::removeSlave(const Registry::Slave& slave)
               << slave.info().id() << " (" << slave.info().hostname() << ")"
               << " since it re-registered!";
 
-    ++metrics->slave_shutdowns_canceled;
-
+    ++metrics->slave_unreachable_canceled;
     return Nothing();
   }
 
@@ -1747,7 +1746,7 @@ Nothing Master::removeSlave(const Registry::Slave& slave)
                << " within " << flags.agent_reregister_timeout
                << " after master failover; removing it from the registrar";
 
-  ++metrics->slave_shutdowns_completed;
+  ++metrics->slave_unreachable_completed;
   ++metrics->recovery_slave_removals;
 
   slaves.recovered.erase(slave.info().id());

http://git-wip-us.apache.org/repos/asf/mesos/blob/93016d37/src/master/metrics.cpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.cpp b/src/master/metrics.cpp
index 3d3338e..1f049f3 100644
--- a/src/master/metrics.cpp
+++ b/src/master/metrics.cpp
@@ -189,7 +189,13 @@ Metrics::Metrics(const Master& master)
     slave_shutdowns_completed(
         "master/slave_shutdowns_completed"),
     slave_shutdowns_canceled(
-        "master/slave_shutdowns_canceled")
+        "master/slave_shutdowns_canceled"),
+    slave_unreachable_scheduled(
+        "master/slave_unreachable_scheduled"),
+    slave_unreachable_completed(
+        "master/slave_unreachable_completed"),
+    slave_unreachable_canceled(
+        "master/slave_unreachable_canceled")
 {
   // TODO(dhamon): Check return values of 'add'.
   process::metrics::add(uptime_secs);
@@ -279,6 +285,10 @@ Metrics::Metrics(const Master& master)
   process::metrics::add(slave_shutdowns_completed);
   process::metrics::add(slave_shutdowns_canceled);
 
+  process::metrics::add(slave_unreachable_scheduled);
+  process::metrics::add(slave_unreachable_completed);
+  process::metrics::add(slave_unreachable_canceled);
+
   // Create resource gauges.
   // TODO(dhamon): Set these up dynamically when adding a slave based on the
   // resources the slave exposes.
@@ -420,6 +430,10 @@ Metrics::~Metrics()
   process::metrics::remove(slave_shutdowns_completed);
   process::metrics::remove(slave_shutdowns_canceled);
 
+  process::metrics::remove(slave_unreachable_scheduled);
+  process::metrics::remove(slave_unreachable_completed);
+  process::metrics::remove(slave_unreachable_canceled);
+
   foreach (const Gauge& gauge, resources_total) {
     process::metrics::remove(gauge);
   }

http://git-wip-us.apache.org/repos/asf/mesos/blob/93016d37/src/master/metrics.hpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.hpp b/src/master/metrics.hpp
index cfddb4b..056d290 100644
--- a/src/master/metrics.hpp
+++ b/src/master/metrics.hpp
@@ -179,10 +179,17 @@ struct Metrics
   process::metrics::Counter slave_removals_reason_registered;
 
   // Slave observer metrics.
+  //
+  // TODO(neilc): The `slave_shutdowns_xxx` metrics are deprecated and
+  // will always be zero. Remove in Mesos 2.0.
   process::metrics::Counter slave_shutdowns_scheduled;
   process::metrics::Counter slave_shutdowns_completed;
   process::metrics::Counter slave_shutdowns_canceled;
 
+  process::metrics::Counter slave_unreachable_scheduled;
+  process::metrics::Counter slave_unreachable_completed;
+  process::metrics::Counter slave_unreachable_canceled;
+
   // Non-revocable resources.
   std::vector<process::metrics::Gauge> resources_total;
   std::vector<process::metrics::Gauge> resources_used;

http://git-wip-us.apache.org/repos/asf/mesos/blob/93016d37/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 4c12615..6cde15f 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -1821,8 +1821,8 @@ TEST_F(MasterTest, RecoveredSlaveDoesNotReregister)
   EXPECT_EQ(1, stats.values["master/recovery_slave_removals"]);
   EXPECT_EQ(1, stats.values["master/slave_removals"]);
   EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
-  EXPECT_EQ(1, stats.values["master/slave_shutdowns_completed"]);
-  EXPECT_EQ(1, stats.values["master/slave_shutdowns_scheduled"]);
+  EXPECT_EQ(1, stats.values["master/slave_unreachable_completed"]);
+  EXPECT_EQ(1, stats.values["master/slave_unreachable_scheduled"]);
 
   Clock::resume();
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/93016d37/src/tests/partition_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/partition_tests.cpp b/src/tests/partition_tests.cpp
index 0a72b34..f3142ad 100644
--- a/src/tests/partition_tests.cpp
+++ b/src/tests/partition_tests.cpp
@@ -148,6 +148,8 @@ TEST_P(PartitionTest, PartitionedSlave)
   slave->reset();
 
   JSON::Object stats = Metrics();
+  EXPECT_EQ(1, stats.values["master/slave_unreachable_scheduled"]);
+  EXPECT_EQ(1, stats.values["master/slave_unreachable_completed"]);
   EXPECT_EQ(1, stats.values["master/slave_removals"]);
   EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);