You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2016/04/12 21:23:35 UTC

[2/5] mesos git commit: Added some metrics to the long-lived-framework example.

Added some metrics to the long-lived-framework example.

Adds metrics to gauge the health of the framework. These include:

* uptime_secs = How long the framework has been running.
* registered = Whether the framework is registered (1) or not (0).
* offers_received = Number of offers received; used to determine
  whether the framework is starved.
* tasks_launched = Number of tasks launched.
* abnormal_terminations = Number of terminal status updates which
  were not `TASK_FINISHED`.

Also adds an endpoint `/framework/counters` that returns the names of
the metrics that are "counters".

Review: https://reviews.apache.org/r/45440/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/c82c12ae
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/c82c12ae
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/c82c12ae

Branch: refs/heads/master
Commit: c82c12aecea0e44214e0a9ac8ad02e33aa197274
Parents: debf0ac
Author: Joseph Wu <jo...@mesosphere.io>
Authored: Tue Apr 12 12:23:13 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Tue Apr 12 12:23:13 2016 -0700

----------------------------------------------------------------------
 src/examples/long_lived_framework.cpp | 136 ++++++++++++++++++++++++++++-
 1 file changed, 134 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/c82c12ae/src/examples/long_lived_framework.cpp
----------------------------------------------------------------------
diff --git a/src/examples/long_lived_framework.cpp b/src/examples/long_lived_framework.cpp
index 29ee6f2..035ddda 100644
--- a/src/examples/long_lived_framework.cpp
+++ b/src/examples/long_lived_framework.cpp
@@ -22,6 +22,18 @@
 #include <mesos/resources.hpp>
 #include <mesos/scheduler.hpp>
 
+#include <process/clock.hpp>
+#include <process/defer.hpp>
+#include <process/help.hpp>
+#include <process/http.hpp>
+#include <process/process.hpp>
+#include <process/protobuf.hpp>
+#include <process/time.hpp>
+
+#include <process/metrics/counter.hpp>
+#include <process/metrics/gauge.hpp>
+#include <process/metrics/metrics.hpp>
+
 #include <stout/flags.hpp>
 #include <stout/foreach.hpp>
 #include <stout/option.hpp>
@@ -34,6 +46,18 @@ using namespace mesos;
 using std::string;
 using std::vector;
 
+using process::AUTHENTICATION;
+using process::Clock;
+using process::defer;
+using process::DESCRIPTION;
+using process::HELP;
+using process::TLDR;
+
+using process::http::OK;
+
+using process::metrics::Gauge;
+using process::metrics::Counter;
+
 
 // NOTE: Per-task resources are nominal because all of the resources for the
 // container are provisioned when the executor is created. The executor can
@@ -57,25 +81,39 @@ public:
       taskResources(Resources::parse(
           "cpus:" + stringify(CPUS_PER_TASK) +
           ";mem:" + stringify(MEM_PER_TASK)).get()),
-      tasksLaunched(0) {}
+      tasksLaunched(0),
+      metrics(*this)
+  {
+    process::spawn(metrics);
+  }
 
-  virtual ~LongLivedScheduler() {}
+  virtual ~LongLivedScheduler()
+  {
+    process::terminate(metrics);
+    process::wait(metrics);
+  }
 
   virtual void registered(SchedulerDriver*,
                           const FrameworkID&,
                           const MasterInfo&)
   {
     LOG(INFO) << "Registered!";
+
+    metrics.isRegistered = true;
   }
 
   virtual void reregistered(SchedulerDriver*, const MasterInfo& masterInfo)
   {
     LOG(INFO) << "Re-registered!";
+
+    metrics.isRegistered = true;
   }
 
   virtual void disconnected(SchedulerDriver* driver)
   {
     LOG(INFO) << "Disconnected!";
+
+    metrics.isRegistered = false;
   }
 
   virtual void resourceOffers(SchedulerDriver* driver,
@@ -83,6 +121,8 @@ public:
   {
     static const Resources EXECUTOR_RESOURCES = Resources(executor.resources());
 
+    metrics.offers_received += offers.size();
+
     foreach (const Offer& offer, offers) {
       if (slaveId.isNone()) {
         // No active executor running in the cluster.
@@ -129,6 +169,13 @@ public:
       << "Task " << status.task_id().value()
       << " is in state " << TaskState_Name(status.state())
       << (status.has_message() ? " with message: " + status.message() : "");
+
+    if (status.state() == TASK_KILLED ||
+        status.state() == TASK_LOST ||
+        status.state() == TASK_FAILED ||
+        status.state() == TASK_ERROR) {
+      ++metrics.abnormal_terminations;
+    }
   }
 
   virtual void frameworkMessage(SchedulerDriver* driver,
@@ -193,6 +240,91 @@ private:
   // Unless that slave/executor dies, this framework will not launch
   // an executor on any other slave.
   Option<SlaveID> slaveId;
+
+  struct Metrics : process::Process<Metrics>
+  {
+    Metrics(const LongLivedScheduler& _scheduler)
+      : ProcessBase("framework"),
+        scheduler(_scheduler),
+        isRegistered(false),
+        uptime_secs(
+            "long_lived_framework/uptime_secs",
+            defer(this, &Self::_uptime_secs)),
+        registered(
+            "long_lived_framework/registered",
+            defer(this, &Self::_registered)),
+        offers_received("long_lived_framework/offers_received"),
+        tasks_launched(
+            "long_lived_framework/tasks_launched",
+            defer(this, &Self::_tasksLaunched)),
+        abnormal_terminations("long_lived_framework/abnormal_terminations")
+    {
+      start_time = Clock::now();
+
+      process::metrics::add(uptime_secs);
+      process::metrics::add(registered);
+      process::metrics::add(offers_received);
+      process::metrics::add(tasks_launched);
+      process::metrics::add(abnormal_terminations);
+    }
+
+    virtual void initialize()
+    {
+      // Special route for metric metadata.
+      route(
+          "/counters",
+          HELP(
+              TLDR("List of counter-type metrics."),
+              DESCRIPTION("Returns 200 OK iff the request is accepted."),
+              AUTHENTICATION(false)),
+          [this](const process::http::Request& request) {
+            JSON::Array array;
+            array.values.push_back("long_lived_framework/offers_received");
+            array.values.push_back(
+                "long_lived_framework/abnormal_terminations");
+
+            return OK(array, request.url.query.get("jsonp"));
+          });
+    }
+
+    ~Metrics()
+    {
+      process::metrics::remove(uptime_secs);
+      process::metrics::remove(registered);
+      process::metrics::remove(offers_received);
+      process::metrics::remove(tasks_launched);
+      process::metrics::remove(abnormal_terminations);
+    }
+
+    const LongLivedScheduler& scheduler;
+
+    process::Time start_time;
+    double _uptime_secs()
+    {
+      return (Clock::now() - start_time).secs();
+    }
+
+    bool isRegistered;
+    double _registered()
+    {
+      return isRegistered ? 1 : 0;
+    }
+
+    double _tasksLaunched()
+    {
+      return scheduler.tasksLaunched;
+    }
+
+    process::metrics::Gauge uptime_secs;
+    process::metrics::Gauge registered;
+
+    process::metrics::Counter offers_received;
+    process::metrics::Gauge tasks_launched;
+
+    // The only expected terminal state is TASK_FINISHED.
+    // Other terminal states are considered incorrect.
+    process::metrics::Counter abnormal_terminations;
+  } metrics;
 };