You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by jp...@apache.org on 2018/04/20 16:22:12 UTC

[3/5] mesos git commit: Handled failing to create the executor sandbox.

Handled failing to create the executor sandbox.

When the agents adds a new executor, creating the sandbox might
fail (most commonly because the requested task user is not present
on the agent). Rather than crashing the agent with a `CHECK`,
we report a `TASK_DROPPED` status update. This makes the behavior
more consistent with the nested containers API, which also reports
an error in this case.

Review: https://reviews.apache.org/r/66706/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/6e4440d8
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/6e4440d8
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/6e4440d8

Branch: refs/heads/master
Commit: 6e4440d8ea045877e51338b620182cecd471818f
Parents: 24773d4
Author: James Peach <jp...@apache.org>
Authored: Fri Apr 20 08:56:58 2018 -0700
Committer: James Peach <jp...@apache.org>
Committed: Fri Apr 20 08:56:58 2018 -0700

----------------------------------------------------------------------
 src/slave/slave.cpp | 94 ++++++++++++++++++++++++++++++++++--------------
 src/slave/slave.hpp |  2 +-
 2 files changed, 69 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/6e4440d8/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index fab31fd..455e3cc 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -2866,29 +2866,6 @@ void Slave::__run(
   LOG(INFO) << "Launching " << taskOrTaskGroup(task, taskGroup)
             << " for framework " << frameworkId;
 
-  auto doLaunchExecutor = [&]() {
-    Executor* executor = framework->addExecutor(executorInfo);
-
-    if (secretGenerator) {
-      generateSecret(framework->id(), executor->id, executor->containerId)
-        .onAny(defer(
-              self(),
-              &Self::launchExecutor,
-              lambda::_1,
-              frameworkId,
-              executorId,
-              taskGroup.isNone() ? task.get() : Option<TaskInfo>::none()));
-    } else {
-      Slave::launchExecutor(
-          None(),
-          frameworkId,
-          executorId,
-          taskGroup.isNone() ? task.get() : Option<TaskInfo>::none());
-    }
-
-    return executor;
-  };
-
   Executor* executor = framework->getExecutor(executorId);
 
   // If launchExecutor is NONE, this is the legacy case where the master
@@ -3016,7 +2993,70 @@ void Slave::__run(
   // or we are in the legacy case of launching one if there wasn't
   // one already. Either way, let's launch executor now.
   if (executor == nullptr) {
-    executor = doLaunchExecutor();
+    Try<Executor*> added = framework->addExecutor(executorInfo);
+
+    if (added.isError()) {
+      CHECK(framework->getExecutor(executorId) == nullptr);
+
+      mesos::TaskState taskState = TASK_DROPPED;
+      if (!protobuf::frameworkHasCapability(
+          frameworkInfo, FrameworkInfo::Capability::PARTITION_AWARE)) {
+        taskState = TASK_LOST;
+      }
+
+      foreach (const TaskInfo& _task, tasks) {
+        const StatusUpdate update = protobuf::createStatusUpdate(
+            frameworkId,
+            info.id(),
+            _task.task_id(),
+            taskState,
+            TaskStatus::SOURCE_SLAVE,
+            id::UUID::random(),
+            added.error(),
+            TaskStatus::REASON_EXECUTOR_TERMINATED,
+            executorId);
+
+        statusUpdate(update, UPID());
+      }
+
+      // Refer to the comment after 'framework->removePendingTask' above
+      // for why we need this.
+      if (framework->idle()) {
+        removeFramework(framework);
+      }
+
+      if (launchExecutor.isSome() && launchExecutor.get()) {
+        // Master expects a new executor to be launched for this task(s).
+        // To keep the master executor entries updated, the agent needs to send
+        // `ExitedExecutorMessage` even though no executor launched.
+        sendExitedExecutorMessage(frameworkId, executorInfo.executor_id());
+
+        // See the declaration of `taskLaunchSequences` regarding its lifecycle
+        // management.
+        framework->taskLaunchSequences.erase(executorInfo.executor_id());
+      }
+
+      return;
+    }
+
+    executor = added.get();
+
+    if (secretGenerator) {
+      generateSecret(framework->id(), executor->id, executor->containerId)
+        .onAny(defer(
+              self(),
+              &Self::launchExecutor,
+              lambda::_1,
+              frameworkId,
+              executorId,
+              taskGroup.isNone() ? task.get() : Option<TaskInfo>::none()));
+    } else {
+      Slave::launchExecutor(
+          None(),
+          frameworkId,
+          executorId,
+          taskGroup.isNone() ? task.get() : Option<TaskInfo>::none());
+    }
   }
 
   CHECK_NOTNULL(executor);
@@ -8855,7 +8895,7 @@ void Framework::checkpointFramework() const
 }
 
 
-Executor* Framework::addExecutor(const ExecutorInfo& executorInfo)
+Try<Executor*> Framework::addExecutor(const ExecutorInfo& executorInfo)
 {
   // Verify that Resource.AllocationInfo is set, if coming
   // from a MULTI_ROLE master this will be set, otherwise
@@ -8900,7 +8940,9 @@ Executor* Framework::addExecutor(const ExecutorInfo& executorInfo)
       containerId,
       user);
 
-  CHECK_SOME(directory);
+  if (directory.isError()) {
+    return Error(directory.error());
+  }
 
   Executor* executor = new Executor(
       slave,

http://git-wip-us.apache.org/repos/asf/mesos/blob/6e4440d8/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index d00c7b2..c35996b 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -1079,7 +1079,7 @@ public:
 
   const FrameworkID id() const { return info.id(); }
 
-  Executor* addExecutor(const ExecutorInfo& executorInfo);
+  Try<Executor*> addExecutor(const ExecutorInfo& executorInfo);
   Executor* getExecutor(const ExecutorID& executorId) const;
   Executor* getExecutor(const TaskID& taskId) const;