You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by qi...@apache.org on 2018/02/14 12:49:21 UTC

[5/7] mesos git commit: Made the default executor treat agent disconnections more gracefully.

Made the default executor treat agent disconnections more gracefully.

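This patch makes the default executor not shut down when it fails to
connect to the agent, or is not subscribed to it, at the time it starts
launching a task group, as long as there are still active child
containers. In that case the task group is dropped instead; the executor
only shuts down if no child containers remain.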

Review: https://reviews.apache.org/r/65556/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/54b6c5b9
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/54b6c5b9
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/54b6c5b9

Branch: refs/heads/master
Commit: 54b6c5b9c7cb059ebd87ee0f9927cfa6ff73129d
Parents: 656196e
Author: Gaston Kleiman <ga...@mesosphere.io>
Authored: Wed Feb 14 14:35:22 2018 +0800
Committer: Qian Zhang <zh...@gmail.com>
Committed: Wed Feb 14 20:37:31 2018 +0800

----------------------------------------------------------------------
 src/launcher/default_executor.cpp | 43 +++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/54b6c5b9/src/launcher/default_executor.cpp
----------------------------------------------------------------------
diff --git a/src/launcher/default_executor.cpp b/src/launcher/default_executor.cpp
index 16977b5..ff437bc 100644
--- a/src/launcher/default_executor.cpp
+++ b/src/launcher/default_executor.cpp
@@ -366,19 +366,33 @@ protected:
     }
 
     if (!connection.isReady()) {
-      LOG(ERROR)
-        << "Unable to establish connection with the agent: "
-        << (connection.isFailed() ? connection.failure() : "discarded");
-      _shutdown();
+      LOG(WARNING) << "Unable to establish connection with the agent to "
+                   << "complete the launch group operation: "
+                   << (connection.isFailed() ? connection.failure()
+                                             : "discarded");
+      dropTaskGroup(taskGroup);
+
+      // Shutdown the executor if all the active child containers have
+      // terminated.
+      if (containers.empty()) {
+        _shutdown();
+      }
+
       return;
     }
 
     // It is possible that the agent process failed after the connection was
-    // established. Shutdown the executor if this happens.
+    // established. Drop the task group if this happens.
     if (state == DISCONNECTED || state == CONNECTED) {
-      LOG(ERROR) << "Unable to complete the launch group operation "
-                 << "as the executor is in state " << state;
-      _shutdown();
+      LOG(WARNING) << "Unable to complete the launch group operation "
+                   << "as the executor is in state " << state;
+      dropTaskGroup(taskGroup);
+
+      // Shutdown the executor if all the active child containers have
+      // terminated.
+      if (containers.empty()) {
+        _shutdown();
+      }
       return;
     }
 
@@ -1515,6 +1529,19 @@ private:
         taskId);
   }
 
+  void dropTaskGroup(const TaskGroupInfo& taskGroup)
+  {
+    TaskState taskState =
+      protobuf::frameworkHasCapability(
+          frameworkInfo.get(), FrameworkInfo::Capability::PARTITION_AWARE)
+        ? TASK_DROPPED
+        : TASK_LOST;
+
+    foreach (const TaskInfo& task, taskGroup.tasks()) {
+      forward(createTaskStatus(task.task_id(), taskState));
+    }
+  }
+
   enum State
   {
     CONNECTED,