You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by dm...@apache.org on 2015/02/11 00:22:12 UTC
mesos git commit: Update Master metrics to match task source and
reason scheme.
Repository: mesos
Updated Branches:
refs/heads/master 6936a6d76 -> 4ec71437e
Update Master metrics to match task source and reason scheme.
Review: https://reviews.apache.org/r/27531/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/4ec71437
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/4ec71437
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/4ec71437
Branch: refs/heads/master
Commit: 4ec71437e890680a31b33ac8ce964657e9963837
Parents: 6936a6d
Author: Dominic Hamon <dh...@twitter.com>
Authored: Thu Oct 16 11:47:43 2014 -0700
Committer: Dominic Hamon <dh...@twitter.com>
Committed: Tue Feb 10 15:12:06 2015 -0800
----------------------------------------------------------------------
src/master/master.cpp | 39 +++++++++++++++++---
src/master/master.hpp | 2 +
src/master/metrics.cpp | 36 +++++++++++++++++-
src/master/metrics.hpp | 11 ++++++
src/slave/slave.cpp | 14 ++++---
src/slave/slave.hpp | 2 +-
src/tests/master_authorization_tests.cpp | 20 ++++++++++
src/tests/master_slave_reconciliation_tests.cpp | 12 ++++++
src/tests/master_tests.cpp | 28 ++++++++++++++
src/tests/mesos.cpp | 19 ++++++++++
src/tests/mesos.hpp | 3 ++
src/tests/rate_limiting_tests.cpp | 1 +
src/tests/slave_tests.cpp | 2 +
13 files changed, 176 insertions(+), 13 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 6affd24..dd594e8 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -2249,6 +2249,11 @@ void Master::accept(
metrics->tasks_lost++;
stats.tasks[TASK_LOST]++;
+ metrics->incrementTasksStates(
+ TASK_LOST,
+ TaskStatus::SOURCE_MASTER,
+ TaskStatus::REASON_INVALID_OFFERS);
+
forward(update, UPID(), framework);
}
}
@@ -2339,6 +2344,9 @@ void Master::_accept(
}
foreach (const TaskInfo& task, operation.launch().task_infos()) {
+ const TaskStatus::Reason reason =
+ slave == NULL ? TaskStatus::REASON_SLAVE_REMOVED
+ : TaskStatus::REASON_SLAVE_DISCONNECTED;
const StatusUpdate& update = protobuf::createStatusUpdate(
framework->id,
task.slave_id(),
@@ -2346,13 +2354,16 @@ void Master::_accept(
TASK_LOST,
TaskStatus::SOURCE_MASTER,
slave == NULL ? "Slave removed" : "Slave disconnected",
- slave == NULL ?
- TaskStatus::REASON_SLAVE_REMOVED :
- TaskStatus::REASON_SLAVE_DISCONNECTED);
+ reason);
metrics->tasks_lost++;
stats.tasks[TASK_LOST]++;
+ metrics->incrementTasksStates(
+ TASK_LOST,
+ TaskStatus::SOURCE_MASTER,
+ reason);
+
forward(update, UPID(), framework);
}
}
@@ -2484,6 +2495,11 @@ void Master::_accept(
metrics->tasks_error++;
stats.tasks[TASK_ERROR]++;
+ metrics->incrementTasksStates(
+ TASK_ERROR,
+ TaskStatus::SOURCE_MASTER,
+ TaskStatus::REASON_TASK_UNAUTHORIZED);
+
forward(update, UPID(), framework);
continue;
@@ -2509,6 +2525,11 @@ void Master::_accept(
metrics->tasks_error++;
stats.tasks[TASK_ERROR]++;
+ metrics->incrementTasksStates(
+ TASK_ERROR,
+ TaskStatus::SOURCE_MASTER,
+ TaskStatus::REASON_TASK_INVALID);
+
forward(update, UPID(), framework);
continue;
@@ -4619,12 +4640,20 @@ void Master::updateTask(Task* task, const StatusUpdate& update)
framework->taskTerminated(task);
}
- switch (task->state()) {
+ switch (status.state()) {
case TASK_FINISHED: ++metrics->tasks_finished; break;
case TASK_FAILED: ++metrics->tasks_failed; break;
case TASK_KILLED: ++metrics->tasks_killed; break;
case TASK_LOST: ++metrics->tasks_lost; break;
- default: break;
+ case TASK_ERROR: ++metrics->tasks_error; break;
+ default: break;
+ }
+
+ if (status.has_reason()) {
+ metrics->incrementTasksStates(
+ status.state(),
+ status.source(),
+ status.reason());
}
}
}
http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 9433b0c..6a39df0 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -673,6 +673,8 @@ private:
// NOTE: It is safe to use a 'shared_ptr' because 'Metrics' is
// thread safe.
+ // TODO(dhamon): Does this need to be a shared_ptr? Metrics contains copyable
+ // metric types only.
memory::shared_ptr<Metrics> metrics;
// Gauge handlers.
http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/master/metrics.cpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.cpp b/src/master/metrics.cpp
index 0b3b91e..3ed84e2 100644
--- a/src/master/metrics.cpp
+++ b/src/master/metrics.cpp
@@ -21,7 +21,6 @@
#include "master/master.hpp"
#include "master/metrics.hpp"
-
namespace mesos {
namespace master {
@@ -342,6 +341,41 @@ Metrics::~Metrics()
process::metrics::remove(gauge);
}
resources_percent.clear();
+
+ foreachvalue (const auto& source_reason, tasks_states) {
+ foreachvalue (const auto& reason_counter, source_reason) {
+ foreachvalue (const process::metrics::Counter& counter, reason_counter) {
+ process::metrics::remove(counter);
+ }
+ }
+ }
+ tasks_states.clear();
+}
+
+
+void Metrics::incrementTasksStates(
+ TaskState state, TaskStatus::Source source, TaskStatus::Reason reason)
+{
+ if (!tasks_states.contains(state)) {
+ tasks_states[state] = SourcesReasons();
+ }
+ if (!tasks_states[state].contains(source)) {
+ tasks_states[state][source] = Reasons();
+ }
+ if (!tasks_states[state][source].contains(reason)) {
+ process::metrics::Counter counter =
+ process::metrics::Counter(
+ "master/" +
+ strings::lower(TaskState_Name(state)) + "/" +
+ strings::lower(TaskStatus::Source_Name(source)) + "/" +
+ strings::lower(TaskStatus::Reason_Name(reason)));
+ tasks_states[state][source].put(reason, counter);
+ process::metrics::add(counter);
+ }
+
+ process::metrics::Counter counter =
+ tasks_states[state][source].get(reason).get();
+ counter++;
}
http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/master/metrics.hpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.hpp b/src/master/metrics.hpp
index ee8b5bc..326eb36 100644
--- a/src/master/metrics.hpp
+++ b/src/master/metrics.hpp
@@ -28,6 +28,8 @@
#include <stout/hashmap.hpp>
+#include "mesos/mesos.hpp"
+
namespace mesos {
namespace master {
@@ -64,6 +66,12 @@ struct Metrics
process::metrics::Counter tasks_lost;
process::metrics::Counter tasks_error;
+ typedef hashmap<TaskStatus::Reason, process::metrics::Counter> Reasons;
+ typedef hashmap<TaskStatus::Source, Reasons> SourcesReasons;
+
+ // NOTE: We only track metrics sources and reasons for terminal states.
+ hashmap<TaskState, SourcesReasons> tasks_states;
+
// Message counters.
process::metrics::Counter dropped_messages;
@@ -163,6 +171,9 @@ struct Metrics
std::vector<process::metrics::Gauge> resources_total;
std::vector<process::metrics::Gauge> resources_used;
std::vector<process::metrics::Gauge> resources_percent;
+
+ void incrementTasksStates(
+ TaskState state, TaskStatus::Source source, TaskStatus::Reason reason);
};
} // namespace master {
http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 3eace81..9de0a46 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -2443,7 +2443,7 @@ void Slave::statusUpdate(const StatusUpdate& update, const UPID& pid)
if (protobuf::isTerminalState(status.state()) &&
(executor->queuedTasks.contains(status.task_id()) ||
executor->launchedTasks.contains(status.task_id()))) {
- executor->terminateTask(status.task_id(), status.state());
+ executor->terminateTask(status.task_id(), status);
// Wait until the container's resources have been updated before
// sending the status update.
@@ -4324,7 +4324,7 @@ Task* Executor::addTask(const TaskInfo& task)
void Executor::terminateTask(
const TaskID& taskId,
- const mesos::TaskState& state)
+ const mesos::TaskStatus& status)
{
VLOG(1) << "Terminating task " << taskId;
@@ -4332,7 +4332,7 @@ void Executor::terminateTask(
// Remove the task if it's queued.
if (queuedTasks.contains(taskId)) {
task = new Task(
- protobuf::createTask(queuedTasks[taskId], state, frameworkId));
+ protobuf::createTask(queuedTasks[taskId], status.state(), frameworkId));
queuedTasks.erase(taskId);
} else if (launchedTasks.contains(taskId)) {
// Update the resources if it's been launched.
@@ -4341,7 +4341,7 @@ void Executor::terminateTask(
launchedTasks.erase(taskId);
}
- switch (state) {
+ switch (status.state()) {
case TASK_FINISHED:
++slave->metrics.tasks_finished;
break;
@@ -4355,10 +4355,12 @@ void Executor::terminateTask(
++slave->metrics.tasks_lost;
break;
default:
- LOG(WARNING) << "Unhandled task state " << state << " on completion.";
+ LOG(WARNING) << "Unhandled task state " << status.state()
+ << " on completion.";
break;
}
+ // TODO(dhamon): Update source/reason metrics.
terminatedTasks[taskId] = CHECK_NOTNULL(task);
}
@@ -4441,7 +4443,7 @@ void Executor::recoverTask(const TaskState& state)
// terminal updates (e.g., when slave recovery is always enabled).
if (protobuf::isTerminalState(update.status().state()) &&
launchedTasks.contains(state.id)) {
- terminateTask(state.id, update.status().state());
+ terminateTask(state.id, update.status());
// If the terminal update has been acknowledged, remove it.
if (state.acks.contains(UUID::fromBytes(update.uuid()))) {
http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index 93fa4b5..7a399f6 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -491,7 +491,7 @@ struct Executor
~Executor();
Task* addTask(const TaskInfo& task);
- void terminateTask(const TaskID& taskId, const mesos::TaskState& state);
+ void terminateTask(const TaskID& taskId, const mesos::TaskStatus& status);
void completeTask(const TaskID& taskId);
void checkpointExecutor();
void checkpointTask(const TaskInfo& task);
http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/tests/master_authorization_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_authorization_tests.cpp b/src/tests/master_authorization_tests.cpp
index 6fd0efa..4b8c134 100644
--- a/src/tests/master_authorization_tests.cpp
+++ b/src/tests/master_authorization_tests.cpp
@@ -361,6 +361,15 @@ TEST_F(MasterAuthorizationTest, SlaveRemoved)
// returned to the allocator.
AWAIT_READY(recoverResources);
+ // Check metrics.
+ JSON::Object stats = Metrics();
+ EXPECT_EQ(1u, stats.values.count("master/tasks_lost"));
+ EXPECT_EQ(1u, stats.values.count(
+ "master/task_lost/source_master/reason_slave_removed"));
+ EXPECT_EQ(1u, stats.values["master/tasks_lost"]);
+ EXPECT_EQ(
+ 1u, stats.values["master/task_lost/source_master/reason_slave_removed"]);
+
driver.stop();
driver.join();
@@ -447,6 +456,17 @@ TEST_F(MasterAuthorizationTest, SlaveDisconnected)
// returned to the allocator.
AWAIT_READY(recoverResources);
+ // Check metrics.
+ JSON::Object stats = Metrics();
+ EXPECT_EQ(1u, stats.values.count("master/tasks_lost"));
+ EXPECT_EQ(1u, stats.values["master/tasks_lost"]);
+ EXPECT_EQ(1u,
+ stats.values.count(
+ "master/task_lost/source_master/reason_slave_disconnected"));
+ EXPECT_EQ(
+ 1u,
+ stats.values["master/task_lost/source_master/reason_slave_disconnected"]);
+
driver.stop();
driver.join();
http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/tests/master_slave_reconciliation_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_slave_reconciliation_tests.cpp b/src/tests/master_slave_reconciliation_tests.cpp
index 0974285..9f9ee6c 100644
--- a/src/tests/master_slave_reconciliation_tests.cpp
+++ b/src/tests/master_slave_reconciliation_tests.cpp
@@ -229,6 +229,18 @@ TEST_F(MasterSlaveReconciliationTest, ReconcileLostTask)
ASSERT_EQ(task.task_id(), status.get().task_id());
ASSERT_EQ(TASK_LOST, status.get().state());
+ // Check metrics.
+ JSON::Object stats = Metrics();
+ EXPECT_EQ(1u, stats.values.count("master/tasks_lost"));
+ EXPECT_EQ(1u, stats.values["master/tasks_lost"]);
+ EXPECT_EQ(
+ 1u,
+ stats.values.count(
+ "master/task_lost/source_slave/reason_reconciliation"));
+ EXPECT_EQ(
+ 1u,
+ stats.values["master/task_lost/source_slave/reason_reconciliation"]);
+
driver.stop();
driver.join();
http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 18eabd4..5b6c485 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -40,6 +40,7 @@
#include <stout/net.hpp>
#include <stout/option.hpp>
#include <stout/os.hpp>
+#include <stout/strings.hpp>
#include <stout/try.hpp>
#include "master/allocator.hpp"
@@ -1364,6 +1365,19 @@ TEST_F(MasterTest, LaunchAcrossSlavesTest)
EXPECT_CALL(exec, shutdown(_))
.Times(AtMost(1));
+ // Check metrics.
+ JSON::Object stats = Metrics();
+ std::cout << stats << '\n';
+ EXPECT_EQ(1u, stats.values.count("master/tasks_lost"));
+ EXPECT_EQ(1u, stats.values["master/tasks_lost"]);
+ EXPECT_EQ(
+ 1u,
+ stats.values.count(
+ "master/task_lost/source_master/reason_invalid_offers"));
+ EXPECT_EQ(
+ 1u,
+ stats.values["master/task_lost/source_master/reason_invalid_offers"]);
+
driver.stop();
driver.join();
@@ -1444,6 +1458,18 @@ TEST_F(MasterTest, LaunchDuplicateOfferTest)
EXPECT_CALL(exec, shutdown(_))
.Times(AtMost(1));
+ // Check metrics.
+ JSON::Object stats = Metrics();
+ EXPECT_EQ(1u, stats.values.count("master/tasks_lost"));
+ EXPECT_EQ(1u, stats.values["master/tasks_lost"]);
+ EXPECT_EQ(
+ 1u,
+ stats.values.count(
+ "master/task_lost/source_master/reason_invalid_offers"));
+ EXPECT_EQ(
+ 1u,
+ stats.values["master/task_lost/source_master/reason_invalid_offers"]);
+
driver.stop();
driver.join();
@@ -1499,6 +1525,8 @@ TEST_F(MasterTest, MetricsInStatsEndpoint)
EXPECT_EQ(1u, stats.values.count("master/tasks_killed"));
EXPECT_EQ(1u, stats.values.count("master/tasks_lost"));
+ // TODO(dhamon): Add expectations for task source reason metrics.
+
EXPECT_EQ(1u, stats.values.count("master/dropped_messages"));
// Messages from schedulers.
http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/tests/mesos.cpp
----------------------------------------------------------------------
diff --git a/src/tests/mesos.cpp b/src/tests/mesos.cpp
index 21a4053..0b4637d 100644
--- a/src/tests/mesos.cpp
+++ b/src/tests/mesos.cpp
@@ -345,6 +345,25 @@ void MesosTest::ShutdownSlaves()
}
+JSON::Object MesosTest::Metrics() const
+{
+ process::UPID upid("metrics", process::address());
+
+ process::Future<process::http::Response> response =
+ process::http::get(upid, "snapshot");
+ AWAIT_EXPECT_RESPONSE_STATUS_EQ(process::http::OK().status, response);
+
+ EXPECT_SOME_EQ(
+ "application/json",
+ response.get().headers.get("Content-Type"));
+
+ Try<JSON::Object> parse = JSON::parse<JSON::Object>(response.get().body);
+ CHECK_SOME(parse);
+
+ return parse.get();
+}
+
+
MockSlave::MockSlave(const slave::Flags& flags,
MasterDetector* detector,
slave::Containerizer* containerizer)
http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/tests/mesos.hpp
----------------------------------------------------------------------
diff --git a/src/tests/mesos.hpp b/src/tests/mesos.hpp
index 77992ac..2b0c90d 100644
--- a/src/tests/mesos.hpp
+++ b/src/tests/mesos.hpp
@@ -178,6 +178,9 @@ protected:
// Stop all slaves.
virtual void ShutdownSlaves();
+ // Get the metrics snapshot.
+ JSON::Object Metrics() const;
+
Cluster cluster;
// Containerizer(s) created during test that we need to cleanup.
http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/tests/rate_limiting_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/rate_limiting_tests.cpp b/src/tests/rate_limiting_tests.cpp
index 8b55bff..8114c79 100644
--- a/src/tests/rate_limiting_tests.cpp
+++ b/src/tests/rate_limiting_tests.cpp
@@ -55,6 +55,7 @@ namespace master {
// Query Mesos metrics snapshot endpoint and return a JSON::Object
// result.
+// TODO(dhamon): Kill this in favour of Metrics() in mesos.hpp
#define METRICS_SNAPSHOT \
({ Future<process::http::Response> response = \
process::http::get(MetricsProcess::instance()->self(), "snapshot"); \
http://git-wip-us.apache.org/repos/asf/mesos/blob/4ec71437/src/tests/slave_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 68a6498..a02e335 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -845,6 +845,8 @@ TEST_F(SlaveTest, MetricsInStatsEndpoint)
EXPECT_EQ(1u, stats.values.count("slave/tasks_killed"));
EXPECT_EQ(1u, stats.values.count("slave/tasks_lost"));
+ // TODO(dhamon): Add expectations for task source reason metrics.
+
EXPECT_EQ(1u, stats.values.count("slave/executors_registering"));
EXPECT_EQ(1u, stats.values.count("slave/executors_running"));
EXPECT_EQ(1u, stats.values.count("slave/executors_terminating"));