You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by jp...@apache.org on 2018/04/20 16:22:12 UTC
[3/5] mesos git commit: Handled failing to create the executor
sandbox.
Handled failing to create the executor sandbox.
When the agents adds a new executor, creating the sandbox might
fail (most commonly because the requested task user is not present
on the agent). Rather than crashing the agent with a `CHECK`,
we report a `TASK_DROPPED` status update. This makes the behavior
more consistent with the nested containers API, which also reports
an error in this case.
Review: https://reviews.apache.org/r/66706/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/6e4440d8
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/6e4440d8
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/6e4440d8
Branch: refs/heads/master
Commit: 6e4440d8ea045877e51338b620182cecd471818f
Parents: 24773d4
Author: James Peach <jp...@apache.org>
Authored: Fri Apr 20 08:56:58 2018 -0700
Committer: James Peach <jp...@apache.org>
Committed: Fri Apr 20 08:56:58 2018 -0700
----------------------------------------------------------------------
src/slave/slave.cpp | 94 ++++++++++++++++++++++++++++++++++--------------
src/slave/slave.hpp | 2 +-
2 files changed, 69 insertions(+), 27 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/6e4440d8/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index fab31fd..455e3cc 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -2866,29 +2866,6 @@ void Slave::__run(
LOG(INFO) << "Launching " << taskOrTaskGroup(task, taskGroup)
<< " for framework " << frameworkId;
- auto doLaunchExecutor = [&]() {
- Executor* executor = framework->addExecutor(executorInfo);
-
- if (secretGenerator) {
- generateSecret(framework->id(), executor->id, executor->containerId)
- .onAny(defer(
- self(),
- &Self::launchExecutor,
- lambda::_1,
- frameworkId,
- executorId,
- taskGroup.isNone() ? task.get() : Option<TaskInfo>::none()));
- } else {
- Slave::launchExecutor(
- None(),
- frameworkId,
- executorId,
- taskGroup.isNone() ? task.get() : Option<TaskInfo>::none());
- }
-
- return executor;
- };
-
Executor* executor = framework->getExecutor(executorId);
// If launchExecutor is NONE, this is the legacy case where the master
@@ -3016,7 +2993,70 @@ void Slave::__run(
// or we are in the legacy case of launching one if there wasn't
// one already. Either way, let's launch executor now.
if (executor == nullptr) {
- executor = doLaunchExecutor();
+ Try<Executor*> added = framework->addExecutor(executorInfo);
+
+ if (added.isError()) {
+ CHECK(framework->getExecutor(executorId) == nullptr);
+
+ mesos::TaskState taskState = TASK_DROPPED;
+ if (!protobuf::frameworkHasCapability(
+ frameworkInfo, FrameworkInfo::Capability::PARTITION_AWARE)) {
+ taskState = TASK_LOST;
+ }
+
+ foreach (const TaskInfo& _task, tasks) {
+ const StatusUpdate update = protobuf::createStatusUpdate(
+ frameworkId,
+ info.id(),
+ _task.task_id(),
+ taskState,
+ TaskStatus::SOURCE_SLAVE,
+ id::UUID::random(),
+ added.error(),
+ TaskStatus::REASON_EXECUTOR_TERMINATED,
+ executorId);
+
+ statusUpdate(update, UPID());
+ }
+
+ // Refer to the comment after 'framework->removePendingTask' above
+ // for why we need this.
+ if (framework->idle()) {
+ removeFramework(framework);
+ }
+
+ if (launchExecutor.isSome() && launchExecutor.get()) {
+ // Master expects a new executor to be launched for this task(s).
+ // To keep the master executor entries updated, the agent needs to send
+ // `ExitedExecutorMessage` even though no executor launched.
+ sendExitedExecutorMessage(frameworkId, executorInfo.executor_id());
+
+ // See the declaration of `taskLaunchSequences` regarding its lifecycle
+ // management.
+ framework->taskLaunchSequences.erase(executorInfo.executor_id());
+ }
+
+ return;
+ }
+
+ executor = added.get();
+
+ if (secretGenerator) {
+ generateSecret(framework->id(), executor->id, executor->containerId)
+ .onAny(defer(
+ self(),
+ &Self::launchExecutor,
+ lambda::_1,
+ frameworkId,
+ executorId,
+ taskGroup.isNone() ? task.get() : Option<TaskInfo>::none()));
+ } else {
+ Slave::launchExecutor(
+ None(),
+ frameworkId,
+ executorId,
+ taskGroup.isNone() ? task.get() : Option<TaskInfo>::none());
+ }
}
CHECK_NOTNULL(executor);
@@ -8855,7 +8895,7 @@ void Framework::checkpointFramework() const
}
-Executor* Framework::addExecutor(const ExecutorInfo& executorInfo)
+Try<Executor*> Framework::addExecutor(const ExecutorInfo& executorInfo)
{
// Verify that Resource.AllocationInfo is set, if coming
// from a MULTI_ROLE master this will be set, otherwise
@@ -8900,7 +8940,9 @@ Executor* Framework::addExecutor(const ExecutorInfo& executorInfo)
containerId,
user);
- CHECK_SOME(directory);
+ if (directory.isError()) {
+ return Error(directory.error());
+ }
Executor* executor = new Executor(
slave,
http://git-wip-us.apache.org/repos/asf/mesos/blob/6e4440d8/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index d00c7b2..c35996b 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -1079,7 +1079,7 @@ public:
const FrameworkID id() const { return info.id(); }
- Executor* addExecutor(const ExecutorInfo& executorInfo);
+ Try<Executor*> addExecutor(const ExecutorInfo& executorInfo);
Executor* getExecutor(const ExecutorID& executorId) const;
Executor* getExecutor(const TaskID& taskId) const;