You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by al...@apache.org on 2017/10/18 19:41:53 UTC

[3/6] mesos git commit: Send TASK_STARTING from the built-in executors.

Send TASK_STARTING from the built-in executors.

This gives schedulers more information about a tasks status,
in particular it gives a better estimate of a tasks start time
and helps differentiating between tasks stuck in TASK_STAGING
and tasks stuck in TASK_STARTING.

Review: https://reviews.apache.org/r/62212/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/de11f0ff
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/de11f0ff
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/de11f0ff

Branch: refs/heads/master
Commit: de11f0ffed36fbabc8fb4167859fd23b75f43f10
Parents: 4b6d848
Author: Benno Evers <be...@mesosphere.com>
Authored: Wed Oct 18 12:15:13 2017 -0700
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Wed Oct 18 12:15:13 2017 -0700

----------------------------------------------------------------------
 docs/high-availability-framework-guide.md |  9 +++--
 src/docker/executor.cpp                   |  8 ++++
 src/launcher/default_executor.cpp         | 52 +++++++++++++++++---------
 src/launcher/executor.cpp                 |  5 ++-
 4 files changed, 51 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/de11f0ff/docs/high-availability-framework-guide.md
----------------------------------------------------------------------
diff --git a/docs/high-availability-framework-guide.md b/docs/high-availability-framework-guide.md
index 73743ab..f246773 100644
--- a/docs/high-availability-framework-guide.md
+++ b/docs/high-availability-framework-guide.md
@@ -189,10 +189,11 @@ initial state and several possible terminal states:
   has not yet started to run. In this state, the task's dependencies are
   fetched---for example, using the [Mesos fetcher cache](fetcher.md).
 
-* The `TASK_STARTING` state is optional and intended primarily for use by
-  custom executors. It can be used to describe the fact that a custom executor
-  has learned about the task (and maybe started fetching its dependencies) but has
-  not yet started to run it.
+* The `TASK_STARTING` state is optional. It can be used to describe the fact
+  that an executor has learned about the task (and maybe started fetching its
+  dependencies) but has not yet started to run it. Custom executors are
+  encouraged to send it, to provide a more detailed description of the current
+  task state to outside observers.
 
 * A task transitions to the `TASK_RUNNING` state after it has begun running
   successfully (if the task fails to start, it transitions to one of the

http://git-wip-us.apache.org/repos/asf/mesos/blob/de11f0ff/src/docker/executor.cpp
----------------------------------------------------------------------
diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index 3b0767f..5f6a0d0 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -159,6 +159,14 @@ public:
 
     LOG(INFO) << "Starting task " << taskId.get();
 
+    // Send initial TASK_STARTING update.
+    // TODO(alexr): Use `protobuf::createTaskStatus()`
+    // instead of manually setting fields.
+    TaskStatus starting;
+    starting.mutable_task_id()->CopyFrom(task.task_id());
+    starting.set_state(TASK_STARTING);
+    driver->sendStatusUpdate(starting);
+
     CHECK(task.has_container());
     CHECK(task.has_command());
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/de11f0ff/src/launcher/default_executor.cpp
----------------------------------------------------------------------
diff --git a/src/launcher/default_executor.cpp b/src/launcher/default_executor.cpp
index e58766f..cdb3c3e 100644
--- a/src/launcher/default_executor.cpp
+++ b/src/launcher/default_executor.cpp
@@ -108,6 +108,12 @@ private:
     // `WAIT_NESTED_CONTAINER` call has not been established yet.
     Option<Connection> waiting;
 
+    // TODO(bennoe): Create a real state machine instead of adding
+    // more and more ad-hoc boolean values.
+
+    // Indicates whether a container has been launched.
+    bool launched;
+
     // Indicates whether a status update acknowledgement
     // has been received for any status update.
     bool acknowledged;
@@ -318,12 +324,14 @@ protected:
       subscribe->add_unacknowledged_updates()->MergeFrom(update);
     }
 
-    // Send all unacknowledged tasks. We don't send unacknowledged terminated
-    // (and hence already removed from `containers`) tasks, because for such
-    // tasks `WAIT_NESTED_CONTAINER` call has already succeeded, meaning the
-    // agent knows about the tasks and corresponding containers.
+    // Send all unacknowledged tasks. We don't send tasks whose container
+    // didn't launch yet, because the agent will learn about once it launches.
+    // We also don't send unacknowledged terminated (and hence already removed
+    // from `containers`) tasks, because for such tasks `WAIT_NESTED_CONTAINER`
+    // call has already succeeded, meaning the agent knows about the tasks and
+    // corresponding containers.
     foreachvalue (const Owned<Container>& container, containers) {
-      if (!container->acknowledged) {
+      if (container->launched && !container->acknowledged) {
         subscribe->add_unacknowledged_tasks()->MergeFrom(container->taskInfo);
       }
     }
@@ -403,6 +411,23 @@ protected:
 
       containerIds.push_back(containerId);
 
+      containers[task.task_id()] = Owned<Container>(new Container{
+        containerId,
+        task,
+        taskGroup,
+        None(),
+        None(),
+        None(),
+        None(),
+        false,
+        false,
+        false,
+        false});
+
+      // Send out the initial TASK_STARTING update.
+      const TaskStatus status = createTaskStatus(task.task_id(), TASK_STARTING);
+      forward(status);
+
       agent::Call call;
       call.set_type(agent::Call::LAUNCH_NESTED_CONTAINER);
 
@@ -526,17 +551,8 @@ protected:
       const TaskInfo& task = taskGroup.tasks().Get(index++);
       const TaskID& taskId = task.task_id();
 
-      containers[taskId] = Owned<Container>(new Container{
-        containerId,
-        task,
-        taskGroup,
-        None(),
-        None(),
-        None(),
-        None(),
-        false,
-        false,
-        false});
+      CHECK(containers.contains(taskId));
+      containers.at(taskId)->launched = true;
 
       if (task.has_check()) {
         Try<Owned<checks::Checker>> checker =
@@ -1410,7 +1426,7 @@ private:
 
     CHECK_EQ(SUBSCRIBED, state);
     CHECK_SOME(connectionId);
-    CHECK(containers.contains(taskId));
+    CHECK(containers.contains(taskId) && containers.at(taskId)->launched);
 
     const Owned<Container>& container = containers.at(taskId);
 
@@ -1469,7 +1485,7 @@ private:
 
   LinkedHashMap<UUID, Call::Update> unacknowledgedUpdates;
 
-  // Active child containers.
+  // Child containers.
   LinkedHashMap<TaskID, Owned<Container>> containers;
 
   // There can be multiple simulataneous ongoing (re-)connection attempts

http://git-wip-us.apache.org/repos/asf/mesos/blob/de11f0ff/src/launcher/executor.cpp
----------------------------------------------------------------------
diff --git a/src/launcher/executor.cpp b/src/launcher/executor.cpp
index 0131577..34f6f7a 100644
--- a/src/launcher/executor.cpp
+++ b/src/launcher/executor.cpp
@@ -525,6 +525,10 @@ protected:
     taskData = TaskData(task);
     taskId = task.task_id();
 
+    // Send initial TASK_STARTING update.
+    TaskStatus starting = createTaskStatus(taskId.get(), TASK_STARTING);
+    forward(starting);
+
     // Capture the kill policy.
     if (task.has_kill_policy()) {
       killPolicy = task.kill_policy();
@@ -1016,7 +1020,6 @@ private:
     // If a check for the task has been defined, `check_status` field in each
     // task status must be set to a valid `CheckStatusInfo` message even if
     // there is no check status available yet.
-    CHECK(taskData.isSome());
     if (taskData->taskInfo.has_check()) {
       CheckStatusInfo checkStatusInfo;
       checkStatusInfo.set_type(taskData->taskInfo.check().type());