You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2013/08/14 20:07:31 UTC

[08/18] git commit: Fixed slave to not recover terminated executors.

Fixed slave to not recover terminated executors.

Review: https://reviews.apache.org/r/13450


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/649295f3
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/649295f3
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/649295f3

Branch: refs/heads/master
Commit: 649295f34d6b7a70314a52cd6db3f74208941b98
Parents: fd3584a
Author: Vinod Kone <vi...@twitter.com>
Authored: Fri Aug 9 14:27:02 2013 -0700
Committer: Vinod Kone <vi...@twitter.com>
Committed: Tue Aug 13 14:35:59 2013 -0700

----------------------------------------------------------------------
 src/slave/monitor.cpp               |  2 +-
 src/slave/slave.cpp                 | 13 +++++++++++--
 src/slave/status_update_manager.cpp | 13 ++++++-------
 src/slave/status_update_manager.hpp |  2 +-
 4 files changed, 19 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/649295f3/src/slave/monitor.cpp
----------------------------------------------------------------------
diff --git a/src/slave/monitor.cpp b/src/slave/monitor.cpp
index 4f3c91f..8e1eb35 100644
--- a/src/slave/monitor.cpp
+++ b/src/slave/monitor.cpp
@@ -183,7 +183,7 @@ void ResourceMonitorProcess::_collect(
   } else {
     // Note that the isolator might have been terminated and pending
     // dispatches deleted, causing the future to get discarded.
-    LOG(WARNING)
+    VLOG(1)
       << "Failed to collect resource usage for executor '" << executorId
       << "' of framework '" << frameworkId << "': "
       << (statistics.isFailed() ? statistics.failure() : "Future discarded");

http://git-wip-us.apache.org/repos/asf/mesos/blob/649295f3/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 5fa1fa7..e8176d2 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -2348,8 +2348,8 @@ void _unwatch(
 {
   if (!unwatch.isReady()) {
     LOG(ERROR) << "Failed to unwatch executor " << executorId
-               << " of framework " << frameworkId
-               << ": " << unwatch.isFailed() ? unwatch.failure() : "discarded";
+               << " of framework " << frameworkId << ": "
+               << (unwatch.isFailed() ? unwatch.failure() : "discarded");
   }
 }
 
@@ -2619,6 +2619,15 @@ Future<Nothing> Slave::_recover(const SlaveState& state, bool reconnect)
 {
   foreachvalue(Framework* framework, frameworks){
     foreachvalue(Executor* executor, framework->executors) {
+      // If the executor is already terminating/terminated don't
+      // bother reconnecting or killing it. This could happen if
+      // the recovered isolator sent an 'ExecutorTerminated' message
+      // before the slave reached this point in recovery.
+      if (executor->state == Executor::TERMINATING ||
+          executor->state == Executor::TERMINATED) {
+        continue;
+      }
+
       // Monitor the executor.
       monitor.watch(
           framework->id,

http://git-wip-us.apache.org/repos/asf/mesos/blob/649295f3/src/slave/status_update_manager.cpp
----------------------------------------------------------------------
diff --git a/src/slave/status_update_manager.cpp b/src/slave/status_update_manager.cpp
index ffd4736..6d4598e 100644
--- a/src/slave/status_update_manager.cpp
+++ b/src/slave/status_update_manager.cpp
@@ -297,8 +297,7 @@ Future<Nothing> StatusUpdateManagerProcess::_update(
   const TaskID& taskId = update.status().task_id();
   const FrameworkID& frameworkId = update.framework_id();
 
-  LOG(INFO) << "Received status update " << update
-            << " with checkpoint=" << stringify(checkpoint);
+  LOG(INFO) << "Received status update " << update;
 
   // Write the status update to disk and enqueue it to send it to the master.
   // Create/Get the status update stream for this task.
@@ -457,8 +456,8 @@ StatusUpdateStream* StatusUpdateManagerProcess::createStatusUpdateStream(
     const Option<ExecutorID>& executorId,
     const Option<UUID>& uuid)
 {
-  LOG(INFO) << "Creating StatusUpdate stream for task " << taskId
-            << " of framework " << frameworkId;
+  VLOG(1) << "Creating StatusUpdate stream for task " << taskId
+          << " of framework " << frameworkId;
 
   StatusUpdateStream* stream = new StatusUpdateStream(
       taskId, frameworkId, slaveId, flags, checkpoint, executorId, uuid);
@@ -488,9 +487,9 @@ void StatusUpdateManagerProcess::cleanupStatusUpdateStream(
     const TaskID& taskId,
     const FrameworkID& frameworkId)
 {
-  LOG(INFO) << "Cleaning up status update stream"
-            << " for task " << taskId
-            << " of framework " << frameworkId;
+  VLOG(1) << "Cleaning up status update stream"
+          << " for task " << taskId
+          << " of framework " << frameworkId;
 
   CHECK(streams.contains(frameworkId))
     << "Cannot find the status update streams for framework " << frameworkId;

http://git-wip-us.apache.org/repos/asf/mesos/blob/649295f3/src/slave/status_update_manager.hpp
----------------------------------------------------------------------
diff --git a/src/slave/status_update_manager.hpp b/src/slave/status_update_manager.hpp
index da92760..ffc79ae 100644
--- a/src/slave/status_update_manager.hpp
+++ b/src/slave/status_update_manager.hpp
@@ -285,7 +285,7 @@ struct StatusUpdateStream
       return Error(error.get());
     }
 
-    LOG(INFO) << "Replaying status update stream for task " << taskId;
+    VLOG(1) << "Replaying status update stream for task " << taskId;
 
     foreach (const StatusUpdate& update, updates) {
       // Handle the update.