You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by zh...@apache.org on 2018/03/14 19:55:32 UTC

[3/3] mesos git commit: Added a gauge for how long agent recovery takes.

Added a gauge for how long agent recovery takes.

The new metric `slave/recovery_time_secs` can be used to tell us how long
the Mesos agent needed to finish its recovery cycle. This is an important
metric on agent machines which have a lot of completed executor
sandboxes.

Note that the metric 1) will only be available after recovery has succeeded
and 2) will never change its value across the agent process lifecycle afterwards.

Review: https://reviews.apache.org/r/65954


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/026dafd3
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/026dafd3
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/026dafd3

Branch: refs/heads/master
Commit: 026dafd33cd23d41818e18e31ec271fa2c13abd2
Parents: 768d6fc
Author: Zhitao Li <zh...@gmail.com>
Authored: Tue Mar 6 17:43:48 2018 -0800
Committer: Zhitao Li <zh...@gmail.com>
Committed: Wed Mar 14 12:54:54 2018 -0700

----------------------------------------------------------------------
 src/slave/metrics.cpp | 18 ++++++++++++++++++
 src/slave/metrics.hpp |  3 +++
 src/slave/slave.cpp   |  2 ++
 3 files changed, 23 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/026dafd3/src/slave/metrics.cpp
----------------------------------------------------------------------
diff --git a/src/slave/metrics.cpp b/src/slave/metrics.cpp
index 0eb2b59..44294af 100644
--- a/src/slave/metrics.cpp
+++ b/src/slave/metrics.cpp
@@ -243,6 +243,24 @@ Metrics::~Metrics()
     process::metrics::remove(gauge);
   }
   resources_revocable_percent.clear();
+
+  if (recovery_time_secs.isSome()) {
+    process::metrics::remove(recovery_time_secs.get());
+  }
+}
+
+
+void Metrics::setRecoveryTime(const Duration& duration)
+{
+  CHECK_NONE(recovery_time_secs);
+
+  const double recovery_seconds = duration.secs();
+
+  recovery_time_secs = process::metrics::Gauge(
+        "slave/recovery_time_secs",
+        [recovery_seconds]() { return recovery_seconds;});
+
+  process::metrics::add(recovery_time_secs.get());
 }
 
 } // namespace slave {

http://git-wip-us.apache.org/repos/asf/mesos/blob/026dafd3/src/slave/metrics.hpp
----------------------------------------------------------------------
diff --git a/src/slave/metrics.hpp b/src/slave/metrics.hpp
index 3fc933c..b771c4b 100644
--- a/src/slave/metrics.hpp
+++ b/src/slave/metrics.hpp
@@ -35,10 +35,13 @@ struct Metrics
 
   ~Metrics();
 
+  void setRecoveryTime(const Duration& duration);
+
   process::metrics::Gauge uptime_secs;
   process::metrics::Gauge registered;
 
   process::metrics::Counter recovery_errors;
+  Option<process::metrics::Gauge> recovery_time_secs;
 
   process::metrics::Gauge frameworks_active;
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/026dafd3/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 4112163..0962ea7 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -7304,6 +7304,8 @@ void Slave::__recover(const Future<Nothing>& future)
   }
 
   recoveryInfo.recovered.set(Nothing()); // Signal recovery.
+
+  metrics.setRecoveryTime(process::Clock::now() - startTime);
 }