You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2016/08/26 21:52:51 UTC

[4/6] mesos git commit: Renamed metrics from "slave_shutdowns" to "slave_unreachable".

Renamed metrics from "slave_shutdowns" to "slave_unreachable".

The master will shortly be changed so that it no longer shuts down
unhealthy agents, so the previous metric names are no longer accurate.
The old metric names have been kept for backwards compatibility, but
they are no longer updated (i.e., they will always be zero).

Review: https://reviews.apache.org/r/50702/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/93016d37
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/93016d37
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/93016d37

Branch: refs/heads/master
Commit: 93016d37bf8833d7a78ada9c4ec59a374419ba35
Parents: af496f3
Author: Neil Conway <ne...@gmail.com>
Authored: Fri Aug 26 14:48:16 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Fri Aug 26 14:49:43 2016 -0700

----------------------------------------------------------------------
 CHANGELOG                     | 14 ++++++++++++++
 docs/monitoring.md            | 24 +++++++++++++-----------
 src/master/master.cpp         | 13 ++++++-------
 src/master/metrics.cpp        | 16 +++++++++++++++-
 src/master/metrics.hpp        |  7 +++++++
 src/tests/master_tests.cpp    |  4 ++--
 src/tests/partition_tests.cpp |  2 ++
 7 files changed, 59 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/93016d37/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index ffeaf10..587d843 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,17 @@
+Release Notes - Mesos - Version 1.1.0 (WIP)
+--------------------------------------------
+This release contains the following new features:
+
+Deprecations:
+
+  * The following metrics are deprecated and will be removed in Mesos 1.4:
+    master/slave_shutdowns_scheduled, master/slave_shutdowns_canceled, and
+    master/slave_shutdowns_completed. As of Mesos 1.1.0, these metrics will
+    always be zero. The following new metrics have been introduced as
+    replacements: master/slave_unreachable_scheduled,
+    master/slave_unreachable_canceled, and master/slave_unreachable_completed.
+
+
 Release Notes - Mesos - Version 1.0.1
 --------------------------------------------
 * This is a bug fix release.

http://git-wip-us.apache.org/repos/asf/mesos/blob/93016d37/docs/monitoring.md
----------------------------------------------------------------------
diff --git a/docs/monitoring.md b/docs/monitoring.md
index e19ecd0..f32ee40 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -347,30 +347,32 @@ unhealthy or that they are not able to connect to the elected master.
 </tr>
 <tr>
   <td>
-  <code>master/slave_shutdowns_scheduled</code>
+  <code>master/slave_unreachable_scheduled</code>
   </td>
   <td>Number of agents which have failed their health check and are scheduled
-      to be removed. They will not be immediately removed due to the Agent
-      Removal Rate-Limit, but <code>master/slave_shutdowns_completed</code>
+      to be marked unreachable. They will not be marked unreachable immediately due to the Agent
+      Removal Rate-Limit, but <code>master/slave_unreachable_completed</code>
       will start increasing as they do get removed.</td>
   <td>Counter</td>
 </tr>
 <tr>
   <td>
-  <code>master/slave_shutdowns_canceled</code>
+  <code>master/slave_unreachable_canceled</code>
   </td>
-  <td>Number of cancelled agent shutdowns. This happens when the agent removal
-      rate limit allows for an agent to reconnect and send a <code>PONG</code>
-      to the master before being removed.</td>
+  <td>Number of times that an agent was due to be marked unreachable but this
+      transition was cancelled. This happens when the agent removal rate limit
+      is enabled and the agent sends a <code>PONG</code> response message to the
+      master before the rate limit allows the agent to be marked unreachable.</td>
   <td>Counter</td>
 </tr>
 <tr>
   <td>
-  <code>master/slave_shutdowns_completed</code>
+  <code>master/slave_unreachable_completed</code>
   </td>
-  <td>Number of agents that failed their health check. These are agents which
-      were not heard from despite the agent-removal rate limit, and have been
-      removed from the master's agent registry.</td>
+  <td>Number of agents that were marked as unreachable because they failed
+      health checks. These are agents which were not heard from despite the
+      agent-removal rate limit, and have been marked as unreachable in the
+      master's agent registry.</td>
   <td>Counter</td>
 </tr>
 <tr>

http://git-wip-us.apache.org/repos/asf/mesos/blob/93016d37/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index ae38c1a..2b4aff8 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -233,7 +233,7 @@ protected:
     }
 
     shuttingDown = acquire.onAny(defer(self(), &Self::_shutdown));
-    ++metrics->slave_shutdowns_scheduled;
+    ++metrics->slave_unreachable_scheduled;
   }
 
   void _shutdown()
@@ -248,7 +248,7 @@ protected:
       LOG(INFO) << "Shutting down agent " << slaveId
                 << " due to health check timeout";
 
-      ++metrics->slave_shutdowns_completed;
+      ++metrics->slave_unreachable_completed;
 
       dispatch(master,
                &Master::shutdownSlave,
@@ -258,7 +258,7 @@ protected:
       LOG(INFO) << "Canceling shutdown of agent " << slaveId
                 << " since a pong is received!";
 
-      ++metrics->slave_shutdowns_canceled;
+      ++metrics->slave_unreachable_canceled;
     }
 
     shuttingDown = None();
@@ -1724,7 +1724,7 @@ void Master::recoveredSlavesTimeout(const Registry& registry)
       .onFailed(lambda::bind(fail, failure, lambda::_1))
       .onDiscarded(lambda::bind(fail, failure, "discarded"));
 
-    ++metrics->slave_shutdowns_scheduled;
+    ++metrics->slave_unreachable_scheduled;
   }
 }
 
@@ -1737,8 +1737,7 @@ Nothing Master::removeSlave(const Registry::Slave& slave)
               << slave.info().id() << " (" << slave.info().hostname() << ")"
               << " since it re-registered!";
 
-    ++metrics->slave_shutdowns_canceled;
-
+    ++metrics->slave_unreachable_canceled;
     return Nothing();
   }
 
@@ -1747,7 +1746,7 @@ Nothing Master::removeSlave(const Registry::Slave& slave)
                << " within " << flags.agent_reregister_timeout
                << " after master failover; removing it from the registrar";
 
-  ++metrics->slave_shutdowns_completed;
+  ++metrics->slave_unreachable_completed;
   ++metrics->recovery_slave_removals;
 
   slaves.recovered.erase(slave.info().id());

http://git-wip-us.apache.org/repos/asf/mesos/blob/93016d37/src/master/metrics.cpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.cpp b/src/master/metrics.cpp
index 3d3338e..1f049f3 100644
--- a/src/master/metrics.cpp
+++ b/src/master/metrics.cpp
@@ -189,7 +189,13 @@ Metrics::Metrics(const Master& master)
     slave_shutdowns_completed(
         "master/slave_shutdowns_completed"),
     slave_shutdowns_canceled(
-        "master/slave_shutdowns_canceled")
+        "master/slave_shutdowns_canceled"),
+    slave_unreachable_scheduled(
+        "master/slave_unreachable_scheduled"),
+    slave_unreachable_completed(
+        "master/slave_unreachable_completed"),
+    slave_unreachable_canceled(
+        "master/slave_unreachable_canceled")
 {
   // TODO(dhamon): Check return values of 'add'.
   process::metrics::add(uptime_secs);
@@ -279,6 +285,10 @@ Metrics::Metrics(const Master& master)
   process::metrics::add(slave_shutdowns_completed);
   process::metrics::add(slave_shutdowns_canceled);
 
+  process::metrics::add(slave_unreachable_scheduled);
+  process::metrics::add(slave_unreachable_completed);
+  process::metrics::add(slave_unreachable_canceled);
+
   // Create resource gauges.
   // TODO(dhamon): Set these up dynamically when adding a slave based on the
   // resources the slave exposes.
@@ -420,6 +430,10 @@ Metrics::~Metrics()
   process::metrics::remove(slave_shutdowns_completed);
   process::metrics::remove(slave_shutdowns_canceled);
 
+  process::metrics::remove(slave_unreachable_scheduled);
+  process::metrics::remove(slave_unreachable_completed);
+  process::metrics::remove(slave_unreachable_canceled);
+
   foreach (const Gauge& gauge, resources_total) {
     process::metrics::remove(gauge);
   }

http://git-wip-us.apache.org/repos/asf/mesos/blob/93016d37/src/master/metrics.hpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.hpp b/src/master/metrics.hpp
index cfddb4b..056d290 100644
--- a/src/master/metrics.hpp
+++ b/src/master/metrics.hpp
@@ -179,10 +179,17 @@ struct Metrics
   process::metrics::Counter slave_removals_reason_registered;
 
   // Slave observer metrics.
+  //
+  // TODO(neilc): The `slave_shutdowns_xxx` metrics are deprecated and
+  // will always be zero. Remove in Mesos 2.0.
   process::metrics::Counter slave_shutdowns_scheduled;
   process::metrics::Counter slave_shutdowns_completed;
   process::metrics::Counter slave_shutdowns_canceled;
 
+  process::metrics::Counter slave_unreachable_scheduled;
+  process::metrics::Counter slave_unreachable_completed;
+  process::metrics::Counter slave_unreachable_canceled;
+
   // Non-revocable resources.
   std::vector<process::metrics::Gauge> resources_total;
   std::vector<process::metrics::Gauge> resources_used;

http://git-wip-us.apache.org/repos/asf/mesos/blob/93016d37/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 4c12615..6cde15f 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -1821,8 +1821,8 @@ TEST_F(MasterTest, RecoveredSlaveDoesNotReregister)
   EXPECT_EQ(1, stats.values["master/recovery_slave_removals"]);
   EXPECT_EQ(1, stats.values["master/slave_removals"]);
   EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
-  EXPECT_EQ(1, stats.values["master/slave_shutdowns_completed"]);
-  EXPECT_EQ(1, stats.values["master/slave_shutdowns_scheduled"]);
+  EXPECT_EQ(1, stats.values["master/slave_unreachable_completed"]);
+  EXPECT_EQ(1, stats.values["master/slave_unreachable_scheduled"]);
 
   Clock::resume();
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/93016d37/src/tests/partition_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/partition_tests.cpp b/src/tests/partition_tests.cpp
index 0a72b34..f3142ad 100644
--- a/src/tests/partition_tests.cpp
+++ b/src/tests/partition_tests.cpp
@@ -148,6 +148,8 @@ TEST_P(PartitionTest, PartitionedSlave)
   slave->reset();
 
   JSON::Object stats = Metrics();
+  EXPECT_EQ(1, stats.values["master/slave_unreachable_scheduled"]);
+  EXPECT_EQ(1, stats.values["master/slave_unreachable_completed"]);
   EXPECT_EQ(1, stats.values["master/slave_removals"]);
   EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);