You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2013/08/14 20:07:31 UTC
[08/18] git commit: Fixed slave to not recover terminated executors.
Fixed slave to not recover terminated executors.
Review: https://reviews.apache.org/r/13450
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/649295f3
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/649295f3
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/649295f3
Branch: refs/heads/master
Commit: 649295f34d6b7a70314a52cd6db3f74208941b98
Parents: fd3584a
Author: Vinod Kone <vi...@twitter.com>
Authored: Fri Aug 9 14:27:02 2013 -0700
Committer: Vinod Kone <vi...@twitter.com>
Committed: Tue Aug 13 14:35:59 2013 -0700
----------------------------------------------------------------------
src/slave/monitor.cpp | 2 +-
src/slave/slave.cpp | 13 +++++++++++--
src/slave/status_update_manager.cpp | 13 ++++++-------
src/slave/status_update_manager.hpp | 2 +-
4 files changed, 19 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/649295f3/src/slave/monitor.cpp
----------------------------------------------------------------------
diff --git a/src/slave/monitor.cpp b/src/slave/monitor.cpp
index 4f3c91f..8e1eb35 100644
--- a/src/slave/monitor.cpp
+++ b/src/slave/monitor.cpp
@@ -183,7 +183,7 @@ void ResourceMonitorProcess::_collect(
} else {
// Note that the isolator might have been terminated and pending
// dispatches deleted, causing the future to get discarded.
- LOG(WARNING)
+ VLOG(1)
<< "Failed to collect resource usage for executor '" << executorId
<< "' of framework '" << frameworkId << "': "
<< (statistics.isFailed() ? statistics.failure() : "Future discarded");
http://git-wip-us.apache.org/repos/asf/mesos/blob/649295f3/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 5fa1fa7..e8176d2 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -2348,8 +2348,8 @@ void _unwatch(
{
if (!unwatch.isReady()) {
LOG(ERROR) << "Failed to unwatch executor " << executorId
- << " of framework " << frameworkId
- << ": " << unwatch.isFailed() ? unwatch.failure() : "discarded";
+ << " of framework " << frameworkId << ": "
+ << (unwatch.isFailed() ? unwatch.failure() : "discarded");
}
}
@@ -2619,6 +2619,15 @@ Future<Nothing> Slave::_recover(const SlaveState& state, bool reconnect)
{
foreachvalue(Framework* framework, frameworks){
foreachvalue(Executor* executor, framework->executors) {
+ // If the executor is already terminating/terminated don't
+ // bother reconnecting or killing it. This could happen if
+ // the recovered isolator sent an 'ExecutorTerminated' message
+ // before the slave is here.
+ if (executor->state == Executor::TERMINATING ||
+ executor->state == Executor::TERMINATED) {
+ continue;
+ }
+
// Monitor the executor.
monitor.watch(
framework->id,
http://git-wip-us.apache.org/repos/asf/mesos/blob/649295f3/src/slave/status_update_manager.cpp
----------------------------------------------------------------------
diff --git a/src/slave/status_update_manager.cpp b/src/slave/status_update_manager.cpp
index ffd4736..6d4598e 100644
--- a/src/slave/status_update_manager.cpp
+++ b/src/slave/status_update_manager.cpp
@@ -297,8 +297,7 @@ Future<Nothing> StatusUpdateManagerProcess::_update(
const TaskID& taskId = update.status().task_id();
const FrameworkID& frameworkId = update.framework_id();
- LOG(INFO) << "Received status update " << update
- << " with checkpoint=" << stringify(checkpoint);
+ LOG(INFO) << "Received status update " << update;
// Write the status update to disk and enqueue it to send it to the master.
// Create/Get the status update stream for this task.
@@ -457,8 +456,8 @@ StatusUpdateStream* StatusUpdateManagerProcess::createStatusUpdateStream(
const Option<ExecutorID>& executorId,
const Option<UUID>& uuid)
{
- LOG(INFO) << "Creating StatusUpdate stream for task " << taskId
- << " of framework " << frameworkId;
+ VLOG(1) << "Creating StatusUpdate stream for task " << taskId
+ << " of framework " << frameworkId;
StatusUpdateStream* stream = new StatusUpdateStream(
taskId, frameworkId, slaveId, flags, checkpoint, executorId, uuid);
@@ -488,9 +487,9 @@ void StatusUpdateManagerProcess::cleanupStatusUpdateStream(
const TaskID& taskId,
const FrameworkID& frameworkId)
{
- LOG(INFO) << "Cleaning up status update stream"
- << " for task " << taskId
- << " of framework " << frameworkId;
+ VLOG(1) << "Cleaning up status update stream"
+ << " for task " << taskId
+ << " of framework " << frameworkId;
CHECK(streams.contains(frameworkId))
<< "Cannot find the status update streams for framework " << frameworkId;
http://git-wip-us.apache.org/repos/asf/mesos/blob/649295f3/src/slave/status_update_manager.hpp
----------------------------------------------------------------------
diff --git a/src/slave/status_update_manager.hpp b/src/slave/status_update_manager.hpp
index da92760..ffc79ae 100644
--- a/src/slave/status_update_manager.hpp
+++ b/src/slave/status_update_manager.hpp
@@ -285,7 +285,7 @@ struct StatusUpdateStream
return Error(error.get());
}
- LOG(INFO) << "Replaying status update stream for task " << taskId;
+ VLOG(1) << "Replaying status update stream for task " << taskId;
foreach (const StatusUpdate& update, updates) {
// Handle the update.