You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by dm...@apache.org on 2015/02/11 00:22:12 UTC

mesos git commit: Update Master metrics to match task source and reason scheme.

Repository: mesos
Updated Branches:
  refs/heads/master 6936a6d76 -> 4ec71437e


Update Master metrics to match task source and reason scheme.

Review: https://reviews.apache.org/r/27531/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/4ec71437
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/4ec71437
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/4ec71437

Branch: refs/heads/master
Commit: 4ec71437e890680a31b33ac8ce964657e9963837
Parents: 6936a6d
Author: Dominic Hamon <dh...@twitter.com>
Authored: Thu Oct 16 11:47:43 2014 -0700
Committer: Dominic Hamon <dh...@twitter.com>
Committed: Tue Feb 10 15:12:06 2015 -0800

----------------------------------------------------------------------
 src/master/master.cpp                           | 39 +++++++++++++++++---
 src/master/master.hpp                           |  2 +
 src/master/metrics.cpp                          | 36 +++++++++++++++++-
 src/master/metrics.hpp                          | 11 ++++++
 src/slave/slave.cpp                             | 14 ++++---
 src/slave/slave.hpp                             |  2 +-
 src/tests/master_authorization_tests.cpp        | 20 ++++++++++
 src/tests/master_slave_reconciliation_tests.cpp | 12 ++++++
 src/tests/master_tests.cpp                      | 28 ++++++++++++++
 src/tests/mesos.cpp                             | 19 ++++++++++
 src/tests/mesos.hpp                             |  3 ++
 src/tests/rate_limiting_tests.cpp               |  1 +
 src/tests/slave_tests.cpp                       |  2 +
 13 files changed, 176 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 6affd24..dd594e8 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -2249,6 +2249,11 @@ void Master::accept(
         metrics->tasks_lost++;
         stats.tasks[TASK_LOST]++;
 
+        metrics->incrementTasksStates(
+            TASK_LOST,
+            TaskStatus::SOURCE_MASTER,
+            TaskStatus::REASON_INVALID_OFFERS);
+
         forward(update, UPID(), framework);
       }
     }
@@ -2339,6 +2344,9 @@ void Master::_accept(
       }
 
       foreach (const TaskInfo& task, operation.launch().task_infos()) {
+        const TaskStatus::Reason reason =
+            slave == NULL ? TaskStatus::REASON_SLAVE_REMOVED
+                          : TaskStatus::REASON_SLAVE_DISCONNECTED;
         const StatusUpdate& update = protobuf::createStatusUpdate(
             framework->id,
             task.slave_id(),
@@ -2346,13 +2354,16 @@ void Master::_accept(
             TASK_LOST,
             TaskStatus::SOURCE_MASTER,
             slave == NULL ? "Slave removed" : "Slave disconnected",
-            slave == NULL ?
-                TaskStatus::REASON_SLAVE_REMOVED :
-                TaskStatus::REASON_SLAVE_DISCONNECTED);
+            reason);
 
         metrics->tasks_lost++;
         stats.tasks[TASK_LOST]++;
 
+        metrics->incrementTasksStates(
+            TASK_LOST,
+            TaskStatus::SOURCE_MASTER,
+            reason);
+
         forward(update, UPID(), framework);
       }
     }
@@ -2484,6 +2495,11 @@ void Master::_accept(
             metrics->tasks_error++;
             stats.tasks[TASK_ERROR]++;
 
+            metrics->incrementTasksStates(
+                TASK_ERROR,
+                TaskStatus::SOURCE_MASTER,
+                TaskStatus::REASON_TASK_UNAUTHORIZED);
+
             forward(update, UPID(), framework);
 
             continue;
@@ -2509,6 +2525,11 @@ void Master::_accept(
             metrics->tasks_error++;
             stats.tasks[TASK_ERROR]++;
 
+            metrics->incrementTasksStates(
+                TASK_ERROR,
+                TaskStatus::SOURCE_MASTER,
+                TaskStatus::REASON_TASK_INVALID);
+
             forward(update, UPID(), framework);
 
             continue;
@@ -4619,12 +4640,20 @@ void Master::updateTask(Task* task, const StatusUpdate& update)
       framework->taskTerminated(task);
     }
 
-    switch (task->state()) {
+    switch (status.state()) {
       case TASK_FINISHED: ++metrics->tasks_finished; break;
       case TASK_FAILED:   ++metrics->tasks_failed;   break;
       case TASK_KILLED:   ++metrics->tasks_killed;   break;
       case TASK_LOST:     ++metrics->tasks_lost;     break;
-      default: break;
+      case TASK_ERROR:    ++metrics->tasks_error;    break;
+      default:                                       break;
+    }
+
+    if (status.has_reason()) {
+      metrics->incrementTasksStates(
+          status.state(),
+          status.source(),
+          status.reason());
     }
   }
 }

http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 9433b0c..6a39df0 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -673,6 +673,8 @@ private:
 
   // NOTE: It is safe to use a 'shared_ptr' because 'Metrics' is
   // thread safe.
+  // TODO(dhamon): Does this need to be a shared_ptr? Metrics contains copyable
+  // metric types only.
   memory::shared_ptr<Metrics> metrics;
 
   // Gauge handlers.

http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/master/metrics.cpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.cpp b/src/master/metrics.cpp
index 0b3b91e..3ed84e2 100644
--- a/src/master/metrics.cpp
+++ b/src/master/metrics.cpp
@@ -21,7 +21,6 @@
 #include "master/master.hpp"
 #include "master/metrics.hpp"
 
-
 namespace mesos {
 namespace master {
 
@@ -342,6 +341,41 @@ Metrics::~Metrics()
     process::metrics::remove(gauge);
   }
   resources_percent.clear();
+
+  foreachvalue (const auto& source_reason, tasks_states) {
+    foreachvalue (const auto& reason_counter, source_reason) {
+      foreachvalue (const process::metrics::Counter& counter, reason_counter) {
+        process::metrics::remove(counter);
+      }
+    }
+  }
+  tasks_states.clear();
+}
+
+
+void Metrics::incrementTasksStates(
+    TaskState state, TaskStatus::Source source, TaskStatus::Reason reason)
+{
+  if (!tasks_states.contains(state)) {
+    tasks_states[state] = SourcesReasons();
+  }
+  if (!tasks_states[state].contains(source)) {
+    tasks_states[state][source] = Reasons();
+  }
+  if (!tasks_states[state][source].contains(reason)) {
+    process::metrics::Counter counter =
+        process::metrics::Counter(
+            "master/" +
+            strings::lower(TaskState_Name(state)) + "/" +
+            strings::lower(TaskStatus::Source_Name(source)) + "/" +
+            strings::lower(TaskStatus::Reason_Name(reason)));
+    tasks_states[state][source].put(reason, counter);
+    process::metrics::add(counter);
+  }
+
+  process::metrics::Counter counter =
+    tasks_states[state][source].get(reason).get();
+  counter++;
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/master/metrics.hpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.hpp b/src/master/metrics.hpp
index ee8b5bc..326eb36 100644
--- a/src/master/metrics.hpp
+++ b/src/master/metrics.hpp
@@ -28,6 +28,8 @@
 
 #include <stout/hashmap.hpp>
 
+#include "mesos/mesos.hpp"
+
 namespace mesos {
 namespace master {
 
@@ -64,6 +66,12 @@ struct Metrics
   process::metrics::Counter tasks_lost;
   process::metrics::Counter tasks_error;
 
+  typedef hashmap<TaskStatus::Reason, process::metrics::Counter> Reasons;
+  typedef hashmap<TaskStatus::Source, Reasons> SourcesReasons;
+
+  // NOTE: We only track metrics sources and reasons for terminal states.
+  hashmap<TaskState, SourcesReasons> tasks_states;
+
   // Message counters.
   process::metrics::Counter dropped_messages;
 
@@ -163,6 +171,9 @@ struct Metrics
   std::vector<process::metrics::Gauge> resources_total;
   std::vector<process::metrics::Gauge> resources_used;
   std::vector<process::metrics::Gauge> resources_percent;
+
+  void incrementTasksStates(
+      TaskState state, TaskStatus::Source source, TaskStatus::Reason reason);
 };
 
 } // namespace master {

http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 3eace81..9de0a46 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -2443,7 +2443,7 @@ void Slave::statusUpdate(const StatusUpdate& update, const UPID& pid)
   if (protobuf::isTerminalState(status.state()) &&
       (executor->queuedTasks.contains(status.task_id()) ||
        executor->launchedTasks.contains(status.task_id()))) {
-    executor->terminateTask(status.task_id(), status.state());
+    executor->terminateTask(status.task_id(), status);
 
     // Wait until the container's resources have been updated before
     // sending the status update.
@@ -4324,7 +4324,7 @@ Task* Executor::addTask(const TaskInfo& task)
 
 void Executor::terminateTask(
     const TaskID& taskId,
-    const mesos::TaskState& state)
+    const mesos::TaskStatus& status)
 {
   VLOG(1) << "Terminating task " << taskId;
 
@@ -4332,7 +4332,7 @@ void Executor::terminateTask(
   // Remove the task if it's queued.
   if (queuedTasks.contains(taskId)) {
     task = new Task(
-        protobuf::createTask(queuedTasks[taskId], state, frameworkId));
+        protobuf::createTask(queuedTasks[taskId], status.state(), frameworkId));
     queuedTasks.erase(taskId);
   } else if (launchedTasks.contains(taskId)) {
     // Update the resources if it's been launched.
@@ -4341,7 +4341,7 @@ void Executor::terminateTask(
     launchedTasks.erase(taskId);
   }
 
-  switch (state) {
+  switch (status.state()) {
     case TASK_FINISHED:
       ++slave->metrics.tasks_finished;
       break;
@@ -4355,10 +4355,12 @@ void Executor::terminateTask(
       ++slave->metrics.tasks_lost;
       break;
     default:
-      LOG(WARNING) << "Unhandled task state " << state << " on completion.";
+      LOG(WARNING) << "Unhandled task state " << status.state()
+                   << " on completion.";
       break;
   }
 
+  // TODO(dhamon): Update source/reason metrics.
   terminatedTasks[taskId] = CHECK_NOTNULL(task);
 }
 
@@ -4441,7 +4443,7 @@ void Executor::recoverTask(const TaskState& state)
     // terminal updates (e.g., when slave recovery is always enabled).
     if (protobuf::isTerminalState(update.status().state()) &&
         launchedTasks.contains(state.id)) {
-      terminateTask(state.id, update.status().state());
+      terminateTask(state.id, update.status());
 
       // If the terminal update has been acknowledged, remove it.
       if (state.acks.contains(UUID::fromBytes(update.uuid()))) {

http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index 93fa4b5..7a399f6 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -491,7 +491,7 @@ struct Executor
   ~Executor();
 
   Task* addTask(const TaskInfo& task);
-  void terminateTask(const TaskID& taskId, const mesos::TaskState& state);
+  void terminateTask(const TaskID& taskId, const mesos::TaskStatus& status);
   void completeTask(const TaskID& taskId);
   void checkpointExecutor();
   void checkpointTask(const TaskInfo& task);

http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/tests/master_authorization_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_authorization_tests.cpp b/src/tests/master_authorization_tests.cpp
index 6fd0efa..4b8c134 100644
--- a/src/tests/master_authorization_tests.cpp
+++ b/src/tests/master_authorization_tests.cpp
@@ -361,6 +361,15 @@ TEST_F(MasterAuthorizationTest, SlaveRemoved)
   // returned to the allocator.
   AWAIT_READY(recoverResources);
 
+  // Check metrics.
+  JSON::Object stats = Metrics();
+  EXPECT_EQ(1u, stats.values.count("master/tasks_lost"));
+  EXPECT_EQ(1u, stats.values.count(
+                    "master/task_lost/source_master/reason_slave_removed"));
+  EXPECT_EQ(1u, stats.values["master/tasks_lost"]);
+  EXPECT_EQ(
+      1u, stats.values["master/task_lost/source_master/reason_slave_removed"]);
+
   driver.stop();
   driver.join();
 
@@ -447,6 +456,17 @@ TEST_F(MasterAuthorizationTest, SlaveDisconnected)
   // returned to the allocator.
   AWAIT_READY(recoverResources);
 
+  // Check metrics.
+  JSON::Object stats = Metrics();
+  EXPECT_EQ(1u, stats.values.count("master/tasks_lost"));
+  EXPECT_EQ(1u, stats.values["master/tasks_lost"]);
+  EXPECT_EQ(1u,
+            stats.values.count(
+                "master/task_lost/source_master/reason_slave_disconnected"));
+  EXPECT_EQ(
+      1u,
+      stats.values["master/task_lost/source_master/reason_slave_disconnected"]);
+
   driver.stop();
   driver.join();
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/tests/master_slave_reconciliation_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_slave_reconciliation_tests.cpp b/src/tests/master_slave_reconciliation_tests.cpp
index 0974285..9f9ee6c 100644
--- a/src/tests/master_slave_reconciliation_tests.cpp
+++ b/src/tests/master_slave_reconciliation_tests.cpp
@@ -229,6 +229,18 @@ TEST_F(MasterSlaveReconciliationTest, ReconcileLostTask)
   ASSERT_EQ(task.task_id(), status.get().task_id());
   ASSERT_EQ(TASK_LOST, status.get().state());
 
+  // Check metrics.
+  JSON::Object stats = Metrics();
+  EXPECT_EQ(1u, stats.values.count("master/tasks_lost"));
+  EXPECT_EQ(1u, stats.values["master/tasks_lost"]);
+  EXPECT_EQ(
+      1u,
+      stats.values.count(
+          "master/task_lost/source_slave/reason_reconciliation"));
+  EXPECT_EQ(
+      1u,
+      stats.values["master/task_lost/source_slave/reason_reconciliation"]);
+
   driver.stop();
   driver.join();
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 18eabd4..5b6c485 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -40,6 +40,7 @@
 #include <stout/net.hpp>
 #include <stout/option.hpp>
 #include <stout/os.hpp>
+#include <stout/strings.hpp>
 #include <stout/try.hpp>
 
 #include "master/allocator.hpp"
@@ -1364,6 +1365,19 @@ TEST_F(MasterTest, LaunchAcrossSlavesTest)
   EXPECT_CALL(exec, shutdown(_))
     .Times(AtMost(1));
 
+  // Check metrics.
+  JSON::Object stats = Metrics();
+  std::cout << stats << '\n';
+  EXPECT_EQ(1u, stats.values.count("master/tasks_lost"));
+  EXPECT_EQ(1u, stats.values["master/tasks_lost"]);
+  EXPECT_EQ(
+      1u,
+      stats.values.count(
+          "master/task_lost/source_master/reason_invalid_offers"));
+  EXPECT_EQ(
+      1u,
+      stats.values["master/task_lost/source_master/reason_invalid_offers"]);
+
   driver.stop();
   driver.join();
 
@@ -1444,6 +1458,18 @@ TEST_F(MasterTest, LaunchDuplicateOfferTest)
   EXPECT_CALL(exec, shutdown(_))
     .Times(AtMost(1));
 
+  // Check metrics.
+  JSON::Object stats = Metrics();
+  EXPECT_EQ(1u, stats.values.count("master/tasks_lost"));
+  EXPECT_EQ(1u, stats.values["master/tasks_lost"]);
+  EXPECT_EQ(
+      1u,
+      stats.values.count(
+          "master/task_lost/source_master/reason_invalid_offers"));
+  EXPECT_EQ(
+      1u,
+      stats.values["master/task_lost/source_master/reason_invalid_offers"]);
+
   driver.stop();
   driver.join();
 
@@ -1499,6 +1525,8 @@ TEST_F(MasterTest, MetricsInStatsEndpoint)
   EXPECT_EQ(1u, stats.values.count("master/tasks_killed"));
   EXPECT_EQ(1u, stats.values.count("master/tasks_lost"));
 
+  // TODO(dhamon): Add expectations for task source reason metrics.
+
   EXPECT_EQ(1u, stats.values.count("master/dropped_messages"));
 
   // Messages from schedulers.

http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/tests/mesos.cpp
----------------------------------------------------------------------
diff --git a/src/tests/mesos.cpp b/src/tests/mesos.cpp
index 21a4053..0b4637d 100644
--- a/src/tests/mesos.cpp
+++ b/src/tests/mesos.cpp
@@ -345,6 +345,25 @@ void MesosTest::ShutdownSlaves()
 }
 
 
+JSON::Object MesosTest::Metrics() const
+{
+  process::UPID upid("metrics", process::address());
+
+  process::Future<process::http::Response> response =
+      process::http::get(upid, "snapshot");
+  AWAIT_EXPECT_RESPONSE_STATUS_EQ(process::http::OK().status, response);
+
+  EXPECT_SOME_EQ(
+      "application/json",
+      response.get().headers.get("Content-Type"));
+
+  Try<JSON::Object> parse = JSON::parse<JSON::Object>(response.get().body);
+  CHECK_SOME(parse);
+
+  return parse.get();
+}
+
+
 MockSlave::MockSlave(const slave::Flags& flags,
                      MasterDetector* detector,
                      slave::Containerizer* containerizer)

http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/tests/mesos.hpp
----------------------------------------------------------------------
diff --git a/src/tests/mesos.hpp b/src/tests/mesos.hpp
index 77992ac..2b0c90d 100644
--- a/src/tests/mesos.hpp
+++ b/src/tests/mesos.hpp
@@ -178,6 +178,9 @@ protected:
   // Stop all slaves.
   virtual void ShutdownSlaves();
 
+  // Get the metrics snapshot.
+  JSON::Object Metrics() const;
+
   Cluster cluster;
 
   // Containerizer(s) created during test that we need to cleanup.

http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/tests/rate_limiting_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/rate_limiting_tests.cpp b/src/tests/rate_limiting_tests.cpp
index 8b55bff..8114c79 100644
--- a/src/tests/rate_limiting_tests.cpp
+++ b/src/tests/rate_limiting_tests.cpp
@@ -55,6 +55,7 @@ namespace master {
 
 // Query Mesos metrics snapshot endpoint and return a JSON::Object
 // result.
+// TODO(dhamon): Kill this in favour of Metrics() in mesos.hpp
 #define METRICS_SNAPSHOT                                                       \
   ({ Future<process::http::Response> response =                                \
        process::http::get(MetricsProcess::instance()->self(), "snapshot");     \

http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/tests/slave_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 68a6498..a02e335 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -845,6 +845,8 @@ TEST_F(SlaveTest, MetricsInStatsEndpoint)
   EXPECT_EQ(1u, stats.values.count("slave/tasks_killed"));
   EXPECT_EQ(1u, stats.values.count("slave/tasks_lost"));
 
+  // TODO(dhamon): Add expectations for task source reason metrics.
+
   EXPECT_EQ(1u, stats.values.count("slave/executors_registering"));
   EXPECT_EQ(1u, stats.values.count("slave/executors_running"));
   EXPECT_EQ(1u, stats.values.count("slave/executors_terminating"));