You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by zh...@apache.org on 2018/03/14 19:55:30 UTC

[1/3] mesos git commit: Document new `slave/recovery_time_secs` gauge.

Repository: mesos
Updated Branches:
  refs/heads/master 768d6fc7e -> 82c50c0e0


Document new `slave/recovery_time_secs` gauge.

Review: https://reviews.apache.org/r/66070


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/82c50c0e
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/82c50c0e
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/82c50c0e

Branch: refs/heads/master
Commit: 82c50c0e00284c131354499f74176b19d89bd21d
Parents: b8526c6
Author: Zhitao Li <zh...@gmail.com>
Authored: Wed Mar 14 09:25:01 2018 -0700
Committer: Zhitao Li <zh...@gmail.com>
Committed: Wed Mar 14 12:54:54 2018 -0700

----------------------------------------------------------------------
 docs/monitoring.md | 8 ++++++++
 1 file changed, 8 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/82c50c0e/docs/monitoring.md
----------------------------------------------------------------------
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 0fba5ce..5c71cc9 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -1629,6 +1629,14 @@ on the agent.
   <td>Number of errors encountered during agent recovery</td>
   <td>Gauge</td>
 </tr>
+<tr>
+  <td>
+  <code>slave/recovery_time_secs</code>
+  </td>
+  <td>Agent recovery time in seconds. This value is only available after agent
+  recovery succeeded and remains constant for the life of the Mesos agent.</td>
+  <td>Gauge</td>
+</tr>
 </table>
 
 #### Tasks


[3/3] mesos git commit: Added a gauge for how long agent recovery takes.

Posted by zh...@apache.org.
Added a gauge for how long agent recovery takes.

The new metric `slave/recovery_time_secs` can be used to tell us how long
the Mesos agent needed to finish its recovery cycle. This is an important
metric on agent machines which have a lot of completed executor
sandboxes.

Note that the metric 1) is only available after recovery has succeeded
and 2) never changes its value for the rest of the agent process lifetime.

Review: https://reviews.apache.org/r/65954


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/026dafd3
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/026dafd3
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/026dafd3

Branch: refs/heads/master
Commit: 026dafd33cd23d41818e18e31ec271fa2c13abd2
Parents: 768d6fc
Author: Zhitao Li <zh...@gmail.com>
Authored: Tue Mar 6 17:43:48 2018 -0800
Committer: Zhitao Li <zh...@gmail.com>
Committed: Wed Mar 14 12:54:54 2018 -0700

----------------------------------------------------------------------
 src/slave/metrics.cpp | 18 ++++++++++++++++++
 src/slave/metrics.hpp |  3 +++
 src/slave/slave.cpp   |  2 ++
 3 files changed, 23 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/026dafd3/src/slave/metrics.cpp
----------------------------------------------------------------------
diff --git a/src/slave/metrics.cpp b/src/slave/metrics.cpp
index 0eb2b59..44294af 100644
--- a/src/slave/metrics.cpp
+++ b/src/slave/metrics.cpp
@@ -243,6 +243,24 @@ Metrics::~Metrics()
     process::metrics::remove(gauge);
   }
   resources_revocable_percent.clear();
+
+  if (recovery_time_secs.isSome()) {
+    process::metrics::remove(recovery_time_secs.get());
+  }
+}
+
+
+void Metrics::setRecoveryTime(const Duration& duration)
+{
+  CHECK_NONE(recovery_time_secs);
+
+  const double recovery_seconds = duration.secs();
+
+  recovery_time_secs = process::metrics::Gauge(
+        "slave/recovery_time_secs",
+        [recovery_seconds]() { return recovery_seconds;});
+
+  process::metrics::add(recovery_time_secs.get());
 }
 
 } // namespace slave {

http://git-wip-us.apache.org/repos/asf/mesos/blob/026dafd3/src/slave/metrics.hpp
----------------------------------------------------------------------
diff --git a/src/slave/metrics.hpp b/src/slave/metrics.hpp
index 3fc933c..b771c4b 100644
--- a/src/slave/metrics.hpp
+++ b/src/slave/metrics.hpp
@@ -35,10 +35,13 @@ struct Metrics
 
   ~Metrics();
 
+  void setRecoveryTime(const Duration& duration);
+
   process::metrics::Gauge uptime_secs;
   process::metrics::Gauge registered;
 
   process::metrics::Counter recovery_errors;
+  Option<process::metrics::Gauge> recovery_time_secs;
 
   process::metrics::Gauge frameworks_active;
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/026dafd3/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 4112163..0962ea7 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -7304,6 +7304,8 @@ void Slave::__recover(const Future<Nothing>& future)
   }
 
   recoveryInfo.recovered.set(Nothing()); // Signal recovery.
+
+  metrics.setRecoveryTime(process::Clock::now() - startTime);
 }
 
 


[2/3] mesos git commit: Added a test to make sure `slave/recovery_time_secs` is reported.

Posted by zh...@apache.org.
Added a test to make sure `slave/recovery_time_secs` is reported.

Review: https://reviews.apache.org/r/65959


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/b8526c61
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/b8526c61
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/b8526c61

Branch: refs/heads/master
Commit: b8526c61403214aaa67fa941b4e8b0fd8e3328f2
Parents: 026dafd
Author: Zhitao Li <zh...@gmail.com>
Authored: Wed Mar 7 15:18:53 2018 -0800
Committer: Zhitao Li <zh...@gmail.com>
Committed: Wed Mar 14 12:54:54 2018 -0700

----------------------------------------------------------------------
 src/tests/slave_tests.cpp | 7 +++++++
 1 file changed, 7 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/b8526c61/src/tests/slave_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index c2afaa5..f76500e 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -1374,12 +1374,19 @@ TEST_F(SlaveTest, MetricsInMetricsEndpoint)
   Try<Owned<cluster::Slave>> slave = StartSlave(detector.get());
   ASSERT_SOME(slave);
 
+  // Make sure slave finishes recovery.
+  Future<RegisterSlaveMessage> registerSlave = FUTURE_PROTOBUF(
+      RegisterSlaveMessage(), slave.get()->pid, master.get()->pid);
+
+  AWAIT_READY(registerSlave);
+
   JSON::Object snapshot = Metrics();
 
   EXPECT_EQ(1u, snapshot.values.count("slave/uptime_secs"));
   EXPECT_EQ(1u, snapshot.values.count("slave/registered"));
 
   EXPECT_EQ(1u, snapshot.values.count("slave/recovery_errors"));
+  EXPECT_EQ(1u, snapshot.values.count("slave/recovery_time_secs"));
 
   EXPECT_EQ(1u, snapshot.values.count("slave/frameworks_active"));