You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2015/05/14 01:59:27 UTC
[3/4] mesos git commit: Added reason metrics for slave removals.
Added reason metrics for slave removals.
Review: https://reviews.apache.org/r/33154
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/cc395509
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/cc395509
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/cc395509
Branch: refs/heads/master
Commit: cc395509a80facbdb22102920430b7f0e04bbc03
Parents: 23a51be
Author: Benjamin Mahler <be...@gmail.com>
Authored: Mon Apr 13 17:19:16 2015 -0700
Committer: Benjamin Mahler <be...@gmail.com>
Committed: Wed May 13 16:45:30 2015 -0700
----------------------------------------------------------------------
include/mesos/mesos.proto | 7 +++++--
src/master/master.cpp | 41 ++++++++++++++++++++++++++++++-----------
src/master/master.hpp | 14 ++++++++++++--
src/master/metrics.cpp | 12 ++++++++++++
src/master/metrics.hpp | 3 +++
5 files changed, 62 insertions(+), 15 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/cc395509/include/mesos/mesos.proto
----------------------------------------------------------------------
diff --git a/include/mesos/mesos.proto b/include/mesos/mesos.proto
index db4fc8c..15f55a3 100644
--- a/include/mesos/mesos.proto
+++ b/include/mesos/mesos.proto
@@ -753,14 +753,17 @@ enum TaskState {
* Describes the current status of a task.
*/
message TaskStatus {
- /** Describes the source of the task status update. */
+ // Describes the source of the task status update.
enum Source {
SOURCE_MASTER = 0;
SOURCE_SLAVE = 1;
SOURCE_EXECUTOR = 2;
}
- /** Detailed reason for the task status update. */
+ // Detailed reason for the task status update.
+ //
+ // TODO(bmahler): Differentiate between slave removal reasons
+ // (e.g. unhealthy vs. unregistered for maintenance).
enum Reason {
REASON_COMMAND_EXECUTOR_FAILED = 0;
REASON_EXECUTOR_TERMINATED = 1;
http://git-wip-us.apache.org/repos/asf/mesos/blob/cc395509/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index ec32cd6..eaea79d 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -993,7 +993,8 @@ void Master::exited(const UPID& pid)
// Remove the slave, if it is not checkpointing.
LOG(INFO) << "Removing disconnected slave " << *slave
<< " because it is not checkpointing!";
- removeSlave(slave);
+ removeSlave(slave,
+ "slave is non-checkpointing and disconnected");
return;
} else if (slave->connected) {
// Checkpointing slaves can just be disconnected.
@@ -1380,7 +1381,9 @@ Nothing Master::removeSlave(const Registry::Slave& slave)
&Self::_removeSlave,
slave.info(),
vector<StatusUpdate>(), // No TASK_LOST updates to send.
- lambda::_1));
+ lambda::_1,
+ "did not re-register after master failover",
+ metrics->slave_removals_reason_unhealthy));
} else {
// When a non-strict registry is in use, we want to ensure the
// registry is used in a write-only manner. Therefore we remove
@@ -3101,7 +3104,9 @@ void Master::registerSlave(
LOG(INFO)
<< "Removing old disconnected slave " << *slave
<< " because a registration attempt is being made from " << from;
- removeSlave(slave);
+ removeSlave(slave,
+ "a new slave registered at the same address",
+ metrics->slave_removals_reason_registered);
break;
} else {
CHECK(slave->active)
@@ -3437,7 +3442,9 @@ void Master::unregisterSlave(const UPID& from, const SlaveID& slaveId)
<< " because it is not the slave " << slave->pid;
return;
}
- removeSlave(slave);
+ removeSlave(slave,
+ "the slave unregistered",
+ metrics->slave_removals_reason_unregistered);
}
}
@@ -3646,7 +3653,7 @@ void Master::shutdownSlave(const SlaveID& slaveId, const string& message)
message_.set_message(message);
send(slave->pid, message_);
- removeSlave(slave);
+ removeSlave(slave, message, metrics->slave_removals_reason_unhealthy);
}
@@ -4664,11 +4671,14 @@ void Master::addSlave(
}
-void Master::removeSlave(Slave* slave)
+void Master::removeSlave(
+ Slave* slave,
+ const string& message,
+ Option<Counter> reason)
{
CHECK_NOTNULL(slave);
- LOG(INFO) << "Removing slave " << *slave;
+ LOG(INFO) << "Removing slave " << *slave << ": " << message;
// We want to remove the slave first, to avoid the allocator
// re-allocating the recovered resources.
@@ -4692,7 +4702,7 @@ void Master::removeSlave(Slave* slave)
task->task_id(),
TASK_LOST,
TaskStatus::SOURCE_MASTER,
- "Slave " + slave->info.hostname() + " removed",
+ "Slave " + slave->info.hostname() + " removed: " + message,
TaskStatus::REASON_SLAVE_REMOVED,
(task->has_executor_id() ?
Option<ExecutorID>(task->executor_id()) : None()));
@@ -4743,7 +4753,9 @@ void Master::removeSlave(Slave* slave)
&Self::_removeSlave,
slave->info,
updates,
- lambda::_1));
+ lambda::_1,
+ message,
+ reason));
delete slave;
}
@@ -4752,7 +4764,9 @@ void Master::removeSlave(Slave* slave)
void Master::_removeSlave(
const SlaveInfo& slaveInfo,
const vector<StatusUpdate>& updates,
- const Future<bool>& removed)
+ const Future<bool>& removed,
+ const string& message,
+ Option<Counter> reason)
{
slaves.removing.erase(slaveInfo.id());
@@ -4769,9 +4783,14 @@ void Master::_removeSlave(
<< "already removed from the registrar";
LOG(INFO) << "Removed slave " << slaveInfo.id() << " ("
- << slaveInfo.hostname() << ")";
+ << slaveInfo.hostname() << "): " << message;
+
++metrics->slave_removals;
+ if (reason.isSome()) {
+ ++utils::copy(reason.get()); // Remove const.
+ }
+
// Forward the LOST updates on to the framework.
foreach (const StatusUpdate& update, updates) {
Framework* framework = getFramework(update.framework_id());
http://git-wip-us.apache.org/repos/asf/mesos/blob/cc395509/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index af41216..da0a835 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -44,6 +44,8 @@
#include <process/protobuf.hpp>
#include <process/timer.hpp>
+#include <process/metrics/counter.hpp>
+
#include <stout/cache.hpp>
#include <stout/foreach.hpp>
#include <stout/hashmap.hpp>
@@ -363,11 +365,19 @@ protected:
Nothing removeSlave(const Registry::Slave& slave);
// Remove the slave from the registrar and from the master's state.
- void removeSlave(Slave* slave);
+ //
+ // TODO(bmahler): 'reason' is optional until MESOS-2317 is resolved.
+ void removeSlave(
+ Slave* slave,
+ const std::string& message,
+ Option<process::metrics::Counter> reason = None());
+
void _removeSlave(
const SlaveInfo& slaveInfo,
const std::vector<StatusUpdate>& updates,
- const process::Future<bool>& removed);
+ const process::Future<bool>& removed,
+ const std::string& message,
+ Option<process::metrics::Counter> reason = None());
// Authorizes the task.
// Returns true if task is authorized.
http://git-wip-us.apache.org/repos/asf/mesos/blob/cc395509/src/master/metrics.cpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.cpp b/src/master/metrics.cpp
index 973f051..ee09664 100644
--- a/src/master/metrics.cpp
+++ b/src/master/metrics.cpp
@@ -148,6 +148,12 @@ Metrics::Metrics(const Master& master)
"master/slave_reregistrations"),
slave_removals(
"master/slave_removals"),
+ slave_removals_reason_unhealthy(
+ "master/slave_removals/reason_unhealthy"),
+ slave_removals_reason_unregistered(
+ "master/slave_removals/reason_unregistered"),
+ slave_removals_reason_registered(
+ "master/slave_removals/reason_registered"),
slave_shutdowns_scheduled(
"master/slave_shutdowns_scheduled"),
slave_shutdowns_completed(
@@ -224,6 +230,9 @@ Metrics::Metrics(const Master& master)
process::metrics::add(slave_registrations);
process::metrics::add(slave_reregistrations);
process::metrics::add(slave_removals);
+ process::metrics::add(slave_removals_reason_unhealthy);
+ process::metrics::add(slave_removals_reason_unregistered);
+ process::metrics::add(slave_removals_reason_registered);
process::metrics::add(slave_shutdowns_scheduled);
process::metrics::add(slave_shutdowns_completed);
@@ -327,6 +336,9 @@ Metrics::~Metrics()
process::metrics::remove(slave_registrations);
process::metrics::remove(slave_reregistrations);
process::metrics::remove(slave_removals);
+ process::metrics::remove(slave_removals_reason_unhealthy);
+ process::metrics::remove(slave_removals_reason_unregistered);
+ process::metrics::remove(slave_removals_reason_registered);
process::metrics::remove(slave_shutdowns_scheduled);
process::metrics::remove(slave_shutdowns_completed);
http://git-wip-us.apache.org/repos/asf/mesos/blob/cc395509/src/master/metrics.hpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.hpp b/src/master/metrics.hpp
index ee3982e..78d0666 100644
--- a/src/master/metrics.hpp
+++ b/src/master/metrics.hpp
@@ -163,6 +163,9 @@ struct Metrics
process::metrics::Counter slave_registrations;
process::metrics::Counter slave_reregistrations;
process::metrics::Counter slave_removals;
+ process::metrics::Counter slave_removals_reason_unhealthy;
+ process::metrics::Counter slave_removals_reason_unregistered;
+ process::metrics::Counter slave_removals_reason_registered;
// Slave observer metrics.
process::metrics::Counter slave_shutdowns_scheduled;