You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2017/08/24 20:11:07 UTC

[1/8] mesos git commit: Added a missing return statement from 45c2444ad.

Repository: mesos
Updated Branches:
  refs/heads/1.2.x f552d6dfe -> 928751aa3


Added a missing return statement from 45c2444ad.

This commit missed a return statement, which leads to a misleading
log message that the task cannot be killed due to the executor not
being found.


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/ae6753d4
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/ae6753d4
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/ae6753d4

Branch: refs/heads/1.2.x
Commit: ae6753d421d60f137ffe43f2a7f097029a9f79f2
Parents: ccdbf88
Author: Benjamin Mahler <bm...@apache.org>
Authored: Wed Aug 23 17:14:42 2017 -0700
Committer: Benjamin Mahler <bm...@apache.org>
Committed: Thu Aug 24 13:10:29 2017 -0700

----------------------------------------------------------------------
 src/slave/slave.cpp | 2 ++
 1 file changed, 2 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/ae6753d4/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 36c380e..5cb075c 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -2502,6 +2502,8 @@ void Slave::killTask(
       // launched.
       statusUpdate(update, UPID());
     }
+
+    return;
   }
 
   Executor* executor = framework->getExecutor(taskId);


[6/8] mesos git commit: Improved the reason and message for tasks killed before delivery.

Posted by bm...@apache.org.
Improved the reason and message for tasks killed before delivery.

Previously, if a task was killed before delivery to a registering
executor, we sent REASON_EXECUTOR_UNREGISTERED with a message of
"Unregistered executor" in the agent, and the master sent no reason.
This introduces a REASON_TASK_KILLED_BEFORE_DELIVERY to clarify
this case to frameworks.

Review: https://reviews.apache.org/r/61640


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/0276d38f
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/0276d38f
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/0276d38f

Branch: refs/heads/1.2.x
Commit: 0276d38fc385165d402a30a14ce7b8f23c1ea818
Parents: f552d6d
Author: Benjamin Mahler <bm...@apache.org>
Authored: Fri Aug 11 13:38:08 2017 -0700
Committer: Benjamin Mahler <bm...@apache.org>
Committed: Thu Aug 24 13:10:29 2017 -0700

----------------------------------------------------------------------
 include/mesos/mesos.proto                |  3 ++-
 include/mesos/v1/mesos.proto             |  3 ++-
 src/master/master.cpp                    |  6 ++++--
 src/slave/slave.cpp                      | 22 ++++++++++++----------
 src/tests/master_authorization_tests.cpp |  9 +++++++--
 src/tests/slave_recovery_tests.cpp       | 11 ++++++-----
 src/tests/slave_tests.cpp                | 10 ++++++----
 7 files changed, 39 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/0276d38f/include/mesos/mesos.proto
----------------------------------------------------------------------
diff --git a/include/mesos/mesos.proto b/include/mesos/mesos.proto
index 5cab60c..ae2de6b 100644
--- a/include/mesos/mesos.proto
+++ b/include/mesos/mesos.proto
@@ -1799,7 +1799,7 @@ message TaskStatus {
     REASON_EXECUTOR_REGISTRATION_TIMEOUT = 23;
     REASON_EXECUTOR_REREGISTRATION_TIMEOUT = 24;
     REASON_EXECUTOR_TERMINATED = 1;
-    REASON_EXECUTOR_UNREGISTERED = 2;
+    REASON_EXECUTOR_UNREGISTERED = 2; // No longer used.
     REASON_FRAMEWORK_REMOVED = 3;
     REASON_GC_ERROR = 4;
     REASON_INVALID_FRAMEWORKID = 5;
@@ -1812,6 +1812,7 @@ message TaskStatus {
     REASON_SLAVE_REMOVED = 11;
     REASON_SLAVE_RESTARTED = 12;
     REASON_SLAVE_UNKNOWN = 13;
+    REASON_TASK_KILLED_DURING_LAUNCH = 30;
     REASON_TASK_CHECK_STATUS_UPDATED = 28;
     REASON_TASK_GROUP_INVALID = 25;
     REASON_TASK_GROUP_UNAUTHORIZED = 26;

http://git-wip-us.apache.org/repos/asf/mesos/blob/0276d38f/include/mesos/v1/mesos.proto
----------------------------------------------------------------------
diff --git a/include/mesos/v1/mesos.proto b/include/mesos/v1/mesos.proto
index 4a52093..c0ad505 100644
--- a/include/mesos/v1/mesos.proto
+++ b/include/mesos/v1/mesos.proto
@@ -1798,7 +1798,7 @@ message TaskStatus {
     REASON_EXECUTOR_REGISTRATION_TIMEOUT = 23;
     REASON_EXECUTOR_REREGISTRATION_TIMEOUT = 24;
     REASON_EXECUTOR_TERMINATED = 1;
-    REASON_EXECUTOR_UNREGISTERED = 2;
+    REASON_EXECUTOR_UNREGISTERED = 2; // No longer used.
     REASON_FRAMEWORK_REMOVED = 3;
     REASON_GC_ERROR = 4;
     REASON_INVALID_FRAMEWORKID = 5;
@@ -1811,6 +1811,7 @@ message TaskStatus {
     REASON_AGENT_REMOVED = 11;
     REASON_AGENT_RESTARTED = 12;
     REASON_AGENT_UNKNOWN = 13;
+    REASON_TASK_KILLED_DURING_LAUNCH = 30;
     REASON_TASK_CHECK_STATUS_UPDATED = 28;
     REASON_TASK_GROUP_INVALID = 25;
     REASON_TASK_GROUP_UNAUTHORIZED = 26;

http://git-wip-us.apache.org/repos/asf/mesos/blob/0276d38f/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 4742d21..8803a87 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -4581,7 +4581,8 @@ void Master::_accept(
                   TaskStatus::SOURCE_MASTER,
                   None(),
                   "A task within the task group was killed before"
-                  " delivery to the agent");
+                  " delivery to the agent",
+                  TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH);
 
               metrics->tasks_killed++;
 
@@ -4935,7 +4936,8 @@ void Master::kill(Framework* framework, const scheduler::Call::Kill& kill)
         TASK_KILLED,
         TaskStatus::SOURCE_MASTER,
         None(),
-        "Killed pending task");
+        "Killed before delivery to the agent",
+        TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH);
 
     forward(update, UPID(), framework);
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/0276d38f/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 100ff0c..db60b93 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -1840,7 +1840,9 @@ void Slave::_run(
           TASK_KILLED,
           TaskStatus::SOURCE_SLAVE,
           UUID::random(),
-          "Task killed before it was launched");
+          "Killed before delivery to the executor",
+          TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH);
+
       statusUpdate(update, UPID());
     }
 
@@ -1901,7 +1903,6 @@ void Slave::_run(
       statusUpdate(update, UPID());
     }
 
-    // Refer to the comment after 'framework->pending.erase' above
     // for why we need this.
     if (framework->executors.empty() && framework->pending.empty()) {
       removeFramework(framework);
@@ -2537,8 +2538,9 @@ void Slave::killTask(
               TASK_KILLED,
               TaskStatus::SOURCE_SLAVE,
               UUID::random(),
-              "Unregistered executor",
-              TaskStatus::REASON_EXECUTOR_UNREGISTERED,
+              "A task within the task group was killed before"
+              " delivery to the executor",
+              TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH,
               executor->id));
         }
       } else {
@@ -2549,8 +2551,8 @@ void Slave::killTask(
             TASK_KILLED,
             TaskStatus::SOURCE_SLAVE,
             UUID::random(),
-            "Unregistered executor",
-            TaskStatus::REASON_EXECUTOR_UNREGISTERED,
+            "Killed before delivery to the executor",
+            TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH,
             executor->id));
       }
 
@@ -2594,8 +2596,8 @@ void Slave::killTask(
                 TASK_KILLED,
                 TaskStatus::SOURCE_SLAVE,
                 UUID::random(),
-                "Task killed while it was queued",
-                None(),
+                "Killed before delivery to the executor",
+                TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH,
                 executor->id));
           }
         } else {
@@ -2606,8 +2608,8 @@ void Slave::killTask(
               TASK_KILLED,
               TaskStatus::SOURCE_SLAVE,
               UUID::random(),
-              "Task killed while it was queued",
-              None(),
+              "Killed before delivery to the executor",
+              TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH,
               executor->id));
         }
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/0276d38f/src/tests/master_authorization_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_authorization_tests.cpp b/src/tests/master_authorization_tests.cpp
index f973cf4..5bfc0ab 100644
--- a/src/tests/master_authorization_tests.cpp
+++ b/src/tests/master_authorization_tests.cpp
@@ -408,7 +408,8 @@ TEST_F(MasterAuthorizationTest, KillTask)
 
   // Framework should get a TASK_KILLED right away.
   AWAIT_READY(status);
-  EXPECT_EQ(TASK_KILLED, status.get().state());
+  EXPECT_EQ(TASK_KILLED, status->state());
+  EXPECT_EQ(TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH, status->reason());
 
   Future<Nothing> recoverResources =
     FUTURE_DISPATCH(_, &MesosAllocatorProcess::recoverResources);
@@ -531,7 +532,9 @@ TEST_F(MasterAuthorizationTest, KillPendingTaskInTaskGroup)
   AWAIT_READY(task1Status);
   EXPECT_EQ(TASK_KILLED, task1Status->state());
   EXPECT_TRUE(strings::contains(
-      task1Status->message(), "Killed pending task"));
+      task1Status->message(), "Killed before delivery to the agent"));
+  EXPECT_EQ(TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH,
+            task1Status->reason());
 
   Future<Nothing> recoverResources =
     FUTURE_DISPATCH(_, &MesosAllocatorProcess::recoverResources);
@@ -545,6 +548,8 @@ TEST_F(MasterAuthorizationTest, KillPendingTaskInTaskGroup)
   EXPECT_TRUE(strings::contains(
       task2Status->message(),
       "A task within the task group was killed before delivery to the agent"));
+  EXPECT_EQ(TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH,
+            task2Status->reason());
 
   // No task launch should happen resulting in all resources being
   // returned to the allocator.

http://git-wip-us.apache.org/repos/asf/mesos/blob/0276d38f/src/tests/slave_recovery_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_recovery_tests.cpp b/src/tests/slave_recovery_tests.cpp
index 2f5338b..3075002 100644
--- a/src/tests/slave_recovery_tests.cpp
+++ b/src/tests/slave_recovery_tests.cpp
@@ -1033,10 +1033,11 @@ TYPED_TEST(SlaveRecoveryTest, RecoverUnregisteredExecutor)
 }
 
 
-// This test verifies that when the agent gets a `killTask` message and restarts
-// before the executor registers, a TASK_KILLED update is sent and the executor
-// shuts down.
-TYPED_TEST(SlaveRecoveryTest, KillTaskUnregisteredExecutor)
+// This test verifies that when the agent gets a `killTask`
+// message for a queued task on a registering executor, a
+// restart of the agent will generate a TASK_KILLED and
+// will shut down the executor.
+TYPED_TEST(SlaveRecoveryTest, KillQueuedTaskDuringExecutorRegistration)
 {
   Try<Owned<cluster::Master>> master = this->StartMaster();
   ASSERT_SOME(master);
@@ -1096,7 +1097,7 @@ TYPED_TEST(SlaveRecoveryTest, KillTaskUnregisteredExecutor)
 
   AWAIT_READY(status);
   EXPECT_EQ(TASK_KILLED, status->state());
-  EXPECT_EQ(TaskStatus::REASON_EXECUTOR_UNREGISTERED, status->reason());
+  EXPECT_EQ(TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH, status->reason());
 
   slave.get()->terminate();
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/0276d38f/src/tests/slave_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index c84cce3..635fb97 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -3409,9 +3409,11 @@ TEST_F(SlaveTest, KillTaskBetweenRunTaskParts)
 }
 
 
-// This test ensures that if a `killTask()` for an executor is received by the
-// agent before the executor registers, the executor is properly cleaned up.
-TEST_F(SlaveTest, KillTaskUnregisteredExecutor)
+// This test verifies that when the agent gets a `killTask`
+// message for a queued task on a registering executor, a
+// the agent will generate a TASK_KILLED and will shut down
+// the executor.
+TEST_F(SlaveTest, KillQueuedTaskDuringExecutorRegistration)
 {
   // Start a master.
   Try<Owned<cluster::Master>> master = StartMaster();
@@ -3479,7 +3481,7 @@ TEST_F(SlaveTest, KillTaskUnregisteredExecutor)
 
   AWAIT_READY(status);
   EXPECT_EQ(TASK_KILLED, status->state());
-  EXPECT_EQ(TaskStatus::REASON_EXECUTOR_UNREGISTERED, status->reason());
+  EXPECT_EQ(TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH, status->reason());
 
   // Now let the executor register by spoofing the message.
   RegisterExecutorMessage registerExecutor;


[2/8] mesos git commit: Fixed a bug where TASK_KILLED updates can be dropped by the agent.

Posted by bm...@apache.org.
Fixed a bug where TASK_KILLED updates can be dropped by the agent.

Per the description of MESOS-7863, there is currently an assumption
that when a pending task is killed, the framework will be stored in
the agent when the launch proceeds for the killed task. When this
assumption does not hold, the TASK_KILLED update will be dropped
due to the frameowrk being unknown when the launch proceeds. This
assumption doesn't hold in two cases:

  (1) Another pending task was killed and we removed the framework
      in 'Slave::run' thinking it was idle, because pending tasks
      was empty (we remove from pending tasks when processing the
      kill). (MESOS-7783 is an example instance of this).

  (2) The last executor terminated without tasks to send terminal
      updates for, or the last terminated executor received its
      last acknowledgement. At this point, we remove the framework
      thinking there were no pending tasks if the task was killed
      (removed from pending).

The fix applied here is to send the status updates from the kill
path rather than the launch path, to be consistent with how we kill
tasks queued within the Executor struct. We ensure that the task
is removed synchronously within the kill path to prevent its launch.

Review: https://reviews.apache.org/r/61645


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/ccdbf887
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/ccdbf887
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/ccdbf887

Branch: refs/heads/1.2.x
Commit: ccdbf88755f6add61cf358a2cc18d6783f4f5b47
Parents: 1d56db0
Author: Benjamin Mahler <bm...@apache.org>
Authored: Mon Aug 14 14:59:08 2017 -0700
Committer: Benjamin Mahler <bm...@apache.org>
Committed: Thu Aug 24 13:10:29 2017 -0700

----------------------------------------------------------------------
 src/slave/slave.cpp       | 252 ++++++++++++++++++++++++++++++++---------
 src/slave/slave.hpp       |  23 +++-
 src/tests/slave_tests.cpp |  33 +++---
 3 files changed, 237 insertions(+), 71 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/ccdbf887/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 0a57be5..36c380e 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -1727,12 +1727,18 @@ void Slave::run(
     }
   }
 
-  // We add the task/task group to 'pending' to ensure the framework is not
-  // removed and the framework and top level executor directories
-  // are not scheduled for deletion before '_run()' is called.
   CHECK_NOTNULL(framework);
-  foreach (const TaskInfo& _task, tasks) {
-    framework->pendingTasks[executorId][_task.task_id()] = _task;
+
+  // Track the pending task / task group to ensure the framework is
+  // not removed and the framework and top level executor directories
+  // are not scheduled for deletion before '_run()' is called.
+  //
+  // TODO(bmahler): Can we instead track pending tasks within the
+  // `Executor` struct by creating it earlier?
+  if (task.isSome()) {
+    framework->addPendingTask(executorId, task.get());
+  } else {
+    framework->addPendingTaskGroup(executorId, taskGroup.get());
   }
 
   // If we are about to create a new executor, unschedule the top
@@ -1799,34 +1805,17 @@ void Slave::_run(
     return;
   }
 
-  const ExecutorID& executorId = executorInfo.executor_id();
-
-  // If any of the tasks in the task group have been killed in the interim,
-  // we send a TASK_KILLED for all the other tasks in the group.
-  bool killed = false;
-  foreach (const TaskInfo& _task, tasks) {
-    if (!framework->removePendingTask(_task.task_id())) {
-      killed = true;
-    }
-  }
-
-  if (killed) {
+  // We don't send a status update here because a terminating
+  // framework cannot send acknowledgements.
+  if (framework->state == Framework::TERMINATING) {
     LOG(WARNING) << "Ignoring running " << taskOrTaskGroup(task, taskGroup)
                  << " of framework " << frameworkId
-                 << " because it has been killed in the meantime";
+                 << " because the framework is terminating";
 
+    // Although we cannot send a status update in this case, we remove
+    // the affected tasks from the pending tasks.
     foreach (const TaskInfo& _task, tasks) {
-      const StatusUpdate update = protobuf::createStatusUpdate(
-          frameworkId,
-          info.id(),
-          _task.task_id(),
-          TASK_KILLED,
-          TaskStatus::SOURCE_SLAVE,
-          UUID::random(),
-          "Killed before delivery to the executor",
-          TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH);
-
-      statusUpdate(update, UPID());
+      framework->removePendingTask(_task.task_id());
     }
 
     if (framework->idle()) {
@@ -1836,22 +1825,34 @@ void Slave::_run(
     return;
   }
 
-  // We don't send a status update here because a terminating
-  // framework cannot send acknowledgements.
-  if (framework->state == Framework::TERMINATING) {
-    LOG(WARNING) << "Ignoring running " << taskOrTaskGroup(task, taskGroup)
-                 << " of framework " << frameworkId
-                 << " because the framework is terminating";
-
-    // Refer to the comment after 'framework->pending.erase' above
-    // for why we need this.
-    if (framework->idle()) {
-      removeFramework(framework);
+  // Ignore the launch if killed in the interim. The invariant here
+  // is that all tasks in the group are still pending, or all were
+  // removed due to a kill arriving for one of the tasks in the group.
+  bool allPending = true;
+  bool allRemoved = true;
+  foreach (const TaskInfo& _task, tasks) {
+    if (framework->isPending(_task.task_id())) {
+      allRemoved = false;
+    } else {
+      allPending = false;
     }
+  }
 
+  CHECK(allPending != allRemoved)
+    << "BUG: The task group " << taskOrTaskGroup(task, taskGroup)
+    << " was killed partially";
+
+  if (allRemoved) {
+    LOG(WARNING) << "Ignoring running " << taskOrTaskGroup(task, taskGroup)
+                 << " of framework " << frameworkId
+                 << " because it has been killed in the meantime";
     return;
   }
 
+  foreach (const TaskInfo& _task, tasks) {
+    CHECK(framework->removePendingTask(_task.task_id()));
+  }
+
   if (!future.isReady()) {
     LOG(ERROR) << "Failed to unschedule directories scheduled for gc: "
                << (future.isFailed() ? future.failure() : "future discarded");
@@ -1884,7 +1885,6 @@ void Slave::_run(
       statusUpdate(update, UPID());
     }
 
-    // for why we need this.
     if (framework->idle()) {
       removeFramework(framework);
     }
@@ -1960,6 +1960,8 @@ void Slave::_run(
 
   CHECK_EQ(kill, false);
 
+  const ExecutorID& executorId = executorInfo.executor_id();
+
   // Refer to the comment above when looping across tasks on
   // why we need to unallocate resources.
   Resources checkpointedExecutorResources =
@@ -2452,20 +2454,54 @@ void Slave::killTask(
     return;
   }
 
-  // TODO(bmahler): Removing the task here is a bug: MESOS-7783.
-  bool removedWhilePending = framework->removePendingTask(taskId);
-
-  if (removedWhilePending) {
+  // If the task is pending, we send a TASK_KILLED immediately.
+  // This will trigger a synchronous removal of the pending task,
+  // which prevents it from being launched.
+  if (framework->isPending(taskId)) {
     LOG(WARNING) << "Killing task " << taskId
                  << " of framework " << frameworkId
                  << " before it was launched";
 
-    // We send the TASK_KILLED status update in `_run()` as the
-    // task being killed could be part of a task group and we
-    // don't store this information in `framework->pendingTasks`.
-    // We don't invoke `removeFramework()` here since we need the
-    // framework to be valid for sending the status update later.
-    return;
+    Option<TaskGroupInfo> taskGroup =
+      framework->getTaskGroupForPendingTask(taskId);
+
+    list<StatusUpdate> updates;
+    if (taskGroup.isSome()) {
+      foreach (const TaskInfo& task, taskGroup->tasks()) {
+        updates.push_back(protobuf::createStatusUpdate(
+            frameworkId,
+            info.id(),
+            task.task_id(),
+            TASK_KILLED,
+            TaskStatus::SOURCE_SLAVE,
+            UUID::random(),
+            "A task within the task group was killed before"
+            " delivery to the executor",
+            TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH,
+            CHECK_NOTNONE(
+                framework->getExecutorIdForPendingTask(task.task_id()))));
+      }
+    } else {
+      updates.push_back(protobuf::createStatusUpdate(
+          frameworkId,
+          info.id(),
+          taskId,
+          TASK_KILLED,
+          TaskStatus::SOURCE_SLAVE,
+          UUID::random(),
+          "Killed before delivery to the executor",
+          TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH,
+          CHECK_NOTNONE(
+              framework->getExecutorIdForPendingTask(taskId))));
+    }
+
+    foreach (const StatusUpdate& update, updates) {
+      // NOTE: Sending a terminal update (TASK_KILLED) synchronously
+      // removes the task/task group from 'framework->pendingTasks'
+      // and 'framework->pendingTaskGroups', so that it will not be
+      // launched.
+      statusUpdate(update, UPID());
+    }
   }
 
   Executor* executor = framework->getExecutor(taskId);
@@ -3936,6 +3972,25 @@ void Slave::statusUpdate(StatusUpdate update, const Option<UPID>& pid)
 
   const TaskStatus& status = update.status();
 
+  // For pending tasks, we must synchronously remove them
+  // to guarantee that the launch is prevented.
+  //
+  // TODO(bmahler): Ideally we store this task as terminated
+  // but with unacknowledged updates (same as the `Executor`
+  // struct does).
+  if (framework->isPending(status.task_id())) {
+    CHECK(framework->removePendingTask(status.task_id()));
+
+    if (framework->idle()) {
+      removeFramework(framework);
+    }
+
+    metrics.valid_status_updates++;
+
+    statusUpdateManager->update(update, info.id())
+      .onAny(defer(self(), &Slave::___statusUpdate, lambda::_1, update, pid));
+  }
+
   Executor* executor = framework->getExecutor(status.task_id());
   if (executor == nullptr) {
     LOG(WARNING)  << "Could not find the executor for "
@@ -6906,19 +6961,110 @@ void Framework::recoverExecutor(const ExecutorState& state)
 }
 
 
+void Framework::addPendingTask(
+    const ExecutorID& executorId,
+    const TaskInfo& task)
+{
+  pendingTasks[executorId][task.task_id()] = task;
+}
+
+
+void Framework::addPendingTaskGroup(
+    const ExecutorID& executorId,
+    const TaskGroupInfo& taskGroup)
+{
+  foreach (const TaskInfo& task, taskGroup.tasks()) {
+    pendingTasks[executorId][task.task_id()] = task;
+  }
+
+  pendingTaskGroups.push_back(taskGroup);
+}
+
+
+bool Framework::isPending(const TaskID& taskId) const
+{
+  foreachkey (const ExecutorID& executorId, pendingTasks) {
+    if (pendingTasks.at(executorId).contains(taskId)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
+Option<TaskGroupInfo> Framework::getTaskGroupForPendingTask(
+    const TaskID& taskId)
+{
+  foreach (const TaskGroupInfo& taskGroup, pendingTaskGroups) {
+    foreach (const TaskInfo& taskInfo, taskGroup.tasks()) {
+      if (taskInfo.task_id() == taskId) {
+        return taskGroup;
+      }
+    }
+  }
+
+  return None();
+}
+
+
 bool Framework::removePendingTask(const TaskID& taskId)
 {
+  bool removed = false;
+
   foreachkey (const ExecutorID& executorId, pendingTasks) {
     if (pendingTasks.at(executorId).contains(taskId)) {
       pendingTasks.at(executorId).erase(taskId);
       if (pendingTasks.at(executorId).empty()) {
         pendingTasks.erase(executorId);
       }
-      return true;
+
+      removed = true;
+      break;
     }
   }
 
-  return false;
+  // We also remove the pending task group if all of its
+  // tasks have been removed.
+  for (auto it = pendingTaskGroups.begin();
+       it != pendingTaskGroups.end();
+       ++it) {
+    foreach (const TaskInfo& t, it->tasks()) {
+      if (t.task_id() == taskId) {
+        // Found its task group, check if all tasks within
+        // the group have been removed.
+        bool allRemoved = true;
+
+        foreach (const TaskInfo& t_, it->tasks()) {
+          if (hasTask(t_.task_id())) {
+            allRemoved = false;
+            break;
+          }
+        }
+
+        if (allRemoved) {
+          pendingTaskGroups.erase(it);
+        }
+
+        return removed;
+      }
+    }
+  }
+
+  return removed;
+}
+
+
+Option<ExecutorID> Framework::getExecutorIdForPendingTask(
+    const TaskID& taskId) const
+{
+  foreachkey (const ExecutorID& executorId, pendingTasks) {
+    if (pendingTasks.at(executorId).contains(taskId)) {
+      return executorId;
+    }
+  }
+
+  return None();
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/ccdbf887/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index 1c4f625..11dd272 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -1047,6 +1047,8 @@ struct Framework
       const ExecutorInfo& executorInfo,
       const Option<TaskInfo>& taskInfo);
 
+  const FrameworkID id() const { return info.id(); }
+
   // Returns whether the framework is idle, where idle is
   // defined as having no activity:
   //   (1) The framework has no non-terminal tasks and executors.
@@ -1062,10 +1064,24 @@ struct Framework
   void recoverExecutor(const state::ExecutorState& state);
   void checkpointFramework() const;
 
+  void addPendingTask(
+      const ExecutorID& executorId,
+      const TaskInfo& task);
+
+  // Note that these tasks will also be tracked within `pendingTasks`.
+  void addPendingTaskGroup(
+      const ExecutorID& executorId,
+      const TaskGroupInfo& taskGroup);
+
+  bool isPending(const TaskID& taskId) const;
+
+  // Returns the task group associated with a pending task.
+  Option<TaskGroupInfo> getTaskGroupForPendingTask(const TaskID& taskId);
+
   // Returns whether the pending task was removed.
   bool removePendingTask(const TaskID& taskId);
 
-  const FrameworkID id() const { return info.id(); }
+  Option<ExecutorID> getExecutorIdForPendingTask(const TaskID& taskId) const;
 
   enum State
   {
@@ -1099,6 +1115,11 @@ struct Framework
   // Executors with pending tasks.
   hashmap<ExecutorID, hashmap<TaskID, TaskInfo>> pendingTasks;
 
+  // Pending task groups. This is needed for correctly sending
+  // TASK_KILLED status updates for all tasks in the group if
+  // any of the tasks are killed while pending.
+  std::list<TaskGroupInfo> pendingTaskGroups;
+
   // Current running executors.
   hashmap<ExecutorID, Executor*> executors;
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/ccdbf887/src/tests/slave_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 635fb97..2ffdca4 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -3381,23 +3381,22 @@ TEST_F(SlaveTest, KillTaskBetweenRunTaskParts)
     .WillOnce(DoAll(Invoke(&slave, &MockSlave::unmocked_killTask),
                     FutureSatisfy(&killTask)));
 
-  driver.killTask(task.task_id());
-
-  AWAIT_READY(killTask);
-
-  // Since this is the only task ever for this framework, the
-  // framework should get removed in Slave::_run().
-  // Thus we can observe that this happens before Shutdown().
   Future<Nothing> removeFramework;
   EXPECT_CALL(slave, removeFramework(_))
     .WillOnce(DoAll(Invoke(&slave, &MockSlave::unmocked_removeFramework),
                     FutureSatisfy(&removeFramework)));
 
-  slave.unmocked__run(
-      future, frameworkInfo, executorInfo, task_, taskGroup);
+  driver.killTask(task.task_id());
 
+  AWAIT_READY(killTask);
+
+  // The agent will remove the framework when killing this task
+  // since there remain no more tasks.
   AWAIT_READY(removeFramework);
 
+  slave.unmocked__run(
+      future, frameworkInfo, executorInfo, task_, taskGroup);
+
   AWAIT_READY(status);
   EXPECT_EQ(TASK_KILLED, status.get().state());
 
@@ -5244,6 +5243,13 @@ TEST_F(SlaveTest, KillTaskGroupBetweenRunTaskParts)
     .WillOnce(DoAll(Invoke(&slave, &MockSlave::unmocked_killTask),
                     FutureSatisfy(&killTask)));
 
+  // Since this is the only task group for this framework, the
+  // framework should get removed when the task is killed.
+  Future<Nothing> removeFramework;
+  EXPECT_CALL(slave, removeFramework(_))
+    .WillOnce(DoAll(Invoke(&slave, &MockSlave::unmocked_removeFramework),
+                    FutureSatisfy(&removeFramework)));
+
   {
     Call call;
     call.mutable_framework_id()->CopyFrom(frameworkId);
@@ -5258,18 +5264,11 @@ TEST_F(SlaveTest, KillTaskGroupBetweenRunTaskParts)
 
   AWAIT_READY(killTask);
 
-  // Since this is the only task group for this framework, the
-  // framework should get removed in `Slave::_run()`.
-  Future<Nothing> removeFramework;
-  EXPECT_CALL(slave, removeFramework(_))
-    .WillOnce(DoAll(Invoke(&slave, &MockSlave::unmocked_removeFramework),
-                    FutureSatisfy(&removeFramework)));
+  AWAIT_READY(removeFramework);
 
   slave.unmocked__run(
       future, frameworkInfo, executorInfo_, task_, taskGroup_);
 
-  AWAIT_READY(removeFramework);
-
   AWAIT_READY(update1);
   AWAIT_READY(update2);
 


[5/8] mesos git commit: Introduced a CHECK_NOTNONE.

Posted by bm...@apache.org.
Introduced a CHECK_NOTNONE.

This is similar to glog's CHECK_NOTNULL.

Review: https://reviews.apache.org/r/61644


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/1d56db0e
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/1d56db0e
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/1d56db0e

Branch: refs/heads/1.2.x
Commit: 1d56db0e85e471de5144406c159b84679a207f51
Parents: dbbd9c2
Author: Benjamin Mahler <bm...@apache.org>
Authored: Thu Aug 10 18:04:48 2017 -0700
Committer: Benjamin Mahler <bm...@apache.org>
Committed: Thu Aug 24 13:10:29 2017 -0700

----------------------------------------------------------------------
 3rdparty/stout/include/stout/check.hpp | 50 ++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/1d56db0e/3rdparty/stout/include/stout/check.hpp
----------------------------------------------------------------------
diff --git a/3rdparty/stout/include/stout/check.hpp b/3rdparty/stout/include/stout/check.hpp
index 6a98519..9ebf513 100644
--- a/3rdparty/stout/include/stout/check.hpp
+++ b/3rdparty/stout/include/stout/check.hpp
@@ -53,10 +53,58 @@
   CHECK_STATE(CHECK_ERROR, _check_error, expression)
 
 
-// Private structs/functions used for CHECK_*.
+// A private helper for CHECK_NOTNONE which is similar to the
+// CHECK_NOTNULL provided by glog.
+template <typename T>
+T&& _check_not_none(
+    const char* file,
+    int line,
+    const char* message,
+    Option<T>&& t) {
+  if (t.isNone()) {
+    google::LogMessageFatal(file, line, new std::string(message));
+  }
+  return std::move(t).get();
+}
+
+
+template <typename T>
+T& _check_not_none(
+    const char* file,
+    int line,
+    const char* message,
+    Option<T>& t) {
+  if (t.isNone()) {
+    google::LogMessageFatal(file, line, new std::string(message));
+  }
+  return t.get();
+}
 
 
 template <typename T>
+const T& _check_not_none(
+    const char* file,
+    int line,
+    const char* message,
+    const Option<T>& t) {
+  if (t.isNone()) {
+    google::LogMessageFatal(file, line, new std::string(message));
+  }
+  return t.get();
+}
+
+
+#define CHECK_NOTNONE(expression) \
+  _check_not_none( \
+      __FILE__, \
+      __LINE__, \
+      "'" #expression "' Must be SOME", \
+      (expression))
+
+
+// Private structs/functions used for CHECK_*.
+
+template <typename T>
 Option<Error> _check_some(const Option<T>& o)
 {
   if (o.isNone()) {


[4/8] mesos git commit: Introduced a Framework::idle function in the agent.

Posted by bm...@apache.org.
Introduced a Framework::idle function in the agent.

This ensures the call-sites consistently check idleness of the
framework, it also aids readability in that it clarifies that
we remove idle frameworks.

Review: https://reviews.apache.org/r/61642


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/99570851
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/99570851
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/99570851

Branch: refs/heads/1.2.x
Commit: 99570851ac4307d91926ad33407577f1a1189866
Parents: ec86ad9
Author: Benjamin Mahler <bm...@apache.org>
Authored: Fri Aug 4 15:51:05 2017 -0700
Committer: Benjamin Mahler <bm...@apache.org>
Committed: Thu Aug 24 13:10:29 2017 -0700

----------------------------------------------------------------------
 src/slave/slave.cpp | 32 +++++++++++++++++---------------
 src/slave/slave.hpp | 10 ++++++++++
 2 files changed, 27 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/99570851/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index a618fb2..754d463 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -1829,9 +1829,7 @@ void Slave::_run(
       statusUpdate(update, UPID());
     }
 
-    // Refer to the comment after 'framework->pending.erase' above
-    // for why we need this.
-    if (framework->executors.empty() && framework->pending.empty()) {
+    if (framework->idle()) {
       removeFramework(framework);
     }
 
@@ -1847,7 +1845,7 @@ void Slave::_run(
 
     // Refer to the comment after 'framework->pending.erase' above
     // for why we need this.
-    if (framework->executors.empty() && framework->pending.empty()) {
+    if (framework->idle()) {
       removeFramework(framework);
     }
 
@@ -1887,7 +1885,7 @@ void Slave::_run(
     }
 
     // for why we need this.
-    if (framework->executors.empty() && framework->pending.empty()) {
+    if (framework->idle()) {
       removeFramework(framework);
     }
 
@@ -1953,7 +1951,7 @@ void Slave::_run(
 
     // Refer to the comment after 'framework->pending.erase' above
     // for why we need this.
-    if (framework->executors.empty() && framework->pending.empty()) {
+    if (framework->idle()) {
       removeFramework(framework);
     }
 
@@ -2006,7 +2004,7 @@ void Slave::_run(
 
     // Refer to the comment after 'framework->pending.erase' above
     // for why we need this.
-    if (framework->executors.empty() && framework->pending.empty()) {
+    if (framework->idle()) {
       removeFramework(framework);
     }
 
@@ -2025,7 +2023,7 @@ void Slave::_run(
 
     // Refer to the comment after 'framework->pending.erase' above
     // for why we need this.
-    if (framework->executors.empty() && framework->pending.empty()) {
+    if (framework->idle()) {
       removeFramework(framework);
     }
 
@@ -2697,7 +2695,7 @@ void Slave::shutdownFramework(
       }
 
       // Remove this framework if it has no pending executors and tasks.
-      if (framework->executors.empty() && framework->pending.empty()) {
+      if (framework->idle()) {
         removeFramework(framework);
       }
       break;
@@ -3139,7 +3137,7 @@ void Slave::_statusUpdateAcknowledgement(
   }
 
   // Remove this framework if it has no pending executors and tasks.
-  if (framework->executors.empty() && framework->pending.empty()) {
+  if (framework->idle()) {
     removeFramework(framework);
   }
 }
@@ -4891,7 +4889,7 @@ void Slave::executorTerminated(
       }
 
       // Remove this framework if it has no pending executors and tasks.
-      if (framework->executors.empty() && framework->pending.empty()) {
+      if (framework->idle()) {
         removeFramework(framework);
       }
       break;
@@ -5003,10 +5001,8 @@ void Slave::removeFramework(Framework* framework)
   CHECK(framework->state == Framework::RUNNING ||
         framework->state == Framework::TERMINATING);
 
-  // The invariant here is that a framework should not be removed
-  // if it has either pending executors or pending tasks.
-  CHECK(framework->executors.empty());
-  CHECK(framework->pending.empty());
+  // We only remove frameworks once they become idle.
+  CHECK(framework->idle());
 
   // Close all status update streams for this framework.
   statusUpdateManager->cleanup(framework->id());
@@ -6478,6 +6474,12 @@ Framework::Framework(
     completedExecutors(slaveFlags.max_completed_executors_per_framework) {}
 
 
+bool Framework::idle() const
+{
+  return executors.empty() && pending.empty();
+}
+
+
 void Framework::checkpointFramework() const
 {
   // Checkpoint the framework info.

http://git-wip-us.apache.org/repos/asf/mesos/blob/99570851/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index bc6f5f7..495d5d1 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -1046,6 +1046,16 @@ struct Framework
   Executor* launchExecutor(
       const ExecutorInfo& executorInfo,
       const Option<TaskInfo>& taskInfo);
+
+  // Returns whether the framework is idle, where idle is
+  // defined as having no activity:
+  //   (1) The framework has no non-terminal tasks and executors.
+  //   (2) All status updates have been acknowledged.
+  //
+  // TODO(bmahler): The framework should also not be considered
+  // idle if there are unacknowledged updates for "pending" tasks.
+  bool idle() const;
+
   void destroyExecutor(const ExecutorID& executorId);
   Executor* getExecutor(const ExecutorID& executorId);
   Executor* getExecutor(const TaskID& taskId);


[7/8] mesos git commit: Renamed Framework::pending to Framework::pendingTasks in agent.

Posted by bm...@apache.org.
Renamed Framework::pending to Framework::pendingTasks in agent.

Review: https://reviews.apache.org/r/61643


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/dbbd9c20
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/dbbd9c20
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/dbbd9c20

Branch: refs/heads/1.2.x
Commit: dbbd9c20fac634a839288a0cae526ff35c5d0bc0
Parents: 9957085
Author: Benjamin Mahler <bm...@apache.org>
Authored: Fri Aug 4 16:56:40 2017 -0700
Committer: Benjamin Mahler <bm...@apache.org>
Committed: Thu Aug 24 13:10:29 2017 -0700

----------------------------------------------------------------------
 src/slave/http.cpp  |  2 +-
 src/slave/slave.cpp | 24 ++++++++++++------------
 src/slave/slave.hpp |  6 +++---
 3 files changed, 16 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/dbbd9c20/src/slave/http.cpp
----------------------------------------------------------------------
diff --git a/src/slave/http.cpp b/src/slave/http.cpp
index ccd489d..df64406 100644
--- a/src/slave/http.cpp
+++ b/src/slave/http.cpp
@@ -1606,7 +1606,7 @@ agent::Response::GetTasks Slave::Http::_getTasks(
   foreach (const Framework* framework, frameworks) {
     // Pending tasks.
     typedef hashmap<TaskID, TaskInfo> TaskMap;
-    foreachvalue (const TaskMap& taskInfos, framework->pending) {
+    foreachvalue (const TaskMap& taskInfos, framework->pendingTasks) {
       foreachvalue (const TaskInfo& taskInfo, taskInfos) {
         // Skip unauthorized tasks.
         if (!approveViewTaskInfo(tasksApprover, taskInfo, framework->info)) {

http://git-wip-us.apache.org/repos/asf/mesos/blob/dbbd9c20/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 754d463..0a57be5 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -1422,7 +1422,7 @@ void Slave::doReliableRegistration(Duration maxBackoff)
       // pending tasks, and we need to send exited events if they
       // cannot be launched, see MESOS-1715, MESOS-1720, MESOS-1800.
       typedef hashmap<TaskID, TaskInfo> TaskMap;
-      foreachvalue (const TaskMap& tasks, framework->pending) {
+      foreachvalue (const TaskMap& tasks, framework->pendingTasks) {
         foreachvalue (const TaskInfo& task, tasks) {
           message.add_tasks()->CopyFrom(protobuf::createTask(
               task, TASK_STAGING, framework->id()));
@@ -1732,7 +1732,7 @@ void Slave::run(
   // are not scheduled for deletion before '_run()' is called.
   CHECK_NOTNULL(framework);
   foreach (const TaskInfo& _task, tasks) {
-    framework->pending[executorId][_task.task_id()] = _task;
+    framework->pendingTasks[executorId][_task.task_id()] = _task;
   }
 
   // If we are about to create a new executor, unschedule the top
@@ -2462,7 +2462,7 @@ void Slave::killTask(
 
     // We send the TASK_KILLED status update in `_run()` as the
     // task being killed could be part of a task group and we
-    // don't store this information in `framework->pending`.
+    // don't store this information in `framework->pendingTasks`.
     // We don't invoke `removeFramework()` here since we need the
     // framework to be valid for sending the status update later.
     return;
@@ -4953,7 +4953,7 @@ void Slave::removeExecutor(Framework* framework, Executor* executor)
 
   // Schedule the top level executor work directory, only if the
   // framework doesn't have any 'pending' tasks for this executor.
-  if (!framework->pending.contains(executor->id)) {
+  if (!framework->pendingTasks.contains(executor->id)) {
     const string path = paths::getExecutorPath(
         flags.work_dir, info.id(), framework->id(), executor->id);
 
@@ -4975,7 +4975,7 @@ void Slave::removeExecutor(Framework* framework, Executor* executor)
 
     // Schedule the top level executor meta directory, only if the
     // framework doesn't have any 'pending' tasks for this executor.
-    if (!framework->pending.contains(executor->id)) {
+    if (!framework->pendingTasks.contains(executor->id)) {
       const string path = paths::getExecutorPath(
           metaDir, info.id(), framework->id(), executor->id);
 
@@ -6118,7 +6118,7 @@ double Slave::_tasks_staging()
   double count = 0.0;
   foreachvalue (Framework* framework, frameworks) {
     typedef hashmap<TaskID, TaskInfo> TaskMap;
-    foreachvalue (const TaskMap& tasks, framework->pending) {
+    foreachvalue (const TaskMap& tasks, framework->pendingTasks) {
       count += tasks.size();
     }
 
@@ -6476,7 +6476,7 @@ Framework::Framework(
 
 bool Framework::idle() const
 {
-  return executors.empty() && pending.empty();
+  return executors.empty() && pendingTasks.empty();
 }
 
 
@@ -6908,11 +6908,11 @@ void Framework::recoverExecutor(const ExecutorState& state)
 
 bool Framework::removePendingTask(const TaskID& taskId)
 {
-  foreachkey (const ExecutorID& executorId, pending) {
-    if (pending.at(executorId).contains(taskId)) {
-      pending.at(executorId).erase(taskId);
-      if (pending.at(executorId).empty()) {
-        pending.erase(executorId);
+  foreachkey (const ExecutorID& executorId, pendingTasks) {
+    if (pendingTasks.at(executorId).contains(taskId)) {
+      pendingTasks.at(executorId).erase(taskId);
+      if (pendingTasks.at(executorId).empty()) {
+        pendingTasks.erase(executorId);
       }
       return true;
     }

http://git-wip-us.apache.org/repos/asf/mesos/blob/dbbd9c20/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index 495d5d1..1c4f625 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -1097,7 +1097,7 @@ struct Framework
   // being bypassed, and provide public views into them.
 
   // Executors with pending tasks.
-  hashmap<ExecutorID, hashmap<TaskID, TaskInfo>> pending;
+  hashmap<ExecutorID, hashmap<TaskID, TaskInfo>> pendingTasks;
 
   // Current running executors.
   hashmap<ExecutorID, Executor*> executors;
@@ -1106,8 +1106,8 @@ struct Framework
 
   bool hasTask(const TaskID& taskId)
   {
-    foreachkey (const ExecutorID& executorId, pending) {
-      if (pending[executorId].contains(taskId)) {
+    foreachkey (const ExecutorID& executorId, pendingTasks) {
+      if (pendingTasks[executorId].contains(taskId)) {
         return true;
       }
     }


[8/8] mesos git commit: Added MESOS-7863 to the 1.2.3 CHANGELOG.

Posted by bm...@apache.org.
Added MESOS-7863 to the 1.2.3 CHANGELOG.


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/928751aa
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/928751aa
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/928751aa

Branch: refs/heads/1.2.x
Commit: 928751aa317b65587bbb478031ce8698a7ef3818
Parents: ae6753d
Author: Benjamin Mahler <bm...@apache.org>
Authored: Thu Aug 24 13:01:41 2017 -0700
Committer: Benjamin Mahler <bm...@apache.org>
Committed: Thu Aug 24 13:10:56 2017 -0700

----------------------------------------------------------------------
 CHANGELOG | 1 +
 1 file changed, 1 insertion(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/928751aa/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index ed94650..e87d7bc 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -8,6 +8,7 @@ All Issues:
   * [MESOS-6950] - Launching two tasks with the same Docker image simultaneously may cause a staging dir never cleaned up.
   * [MESOS-7652] - Docker image with universal containerizer does not work if WORKDIR is missing in the rootfs.
   * [MESOS-7858] - Launching a nested container with namespace/pid isolation, with glibc < 2.25, may deadlock the LinuxLauncher and MesosContainerizer.
+  * [MESOS-7863] - Agent may drop pending kill task status updates.
   * [MESOS-7865] - Agent may process a kill task and still launch the task.
   * [MESOS-7909] - Ordering dependency between 'linux/capabilities' and 'docker/runtime' isolator.
 


[3/8] mesos git commit: Updated Framework::removePendingTask to take only a TaskID.

Posted by bm...@apache.org.
Updated Framework::removePendingTask to take only a TaskID.

This allows the function to be used in the kill task path where we
do not have the TaskInfo or ExecutorInfo available. With this, we
now have all removals going through this function.

Review: https://reviews.apache.org/r/61641


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/ec86ad92
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/ec86ad92
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/ec86ad92

Branch: refs/heads/1.2.x
Commit: ec86ad92ef932f9a95b06920e3e875e484e80462
Parents: 0276d38
Author: Benjamin Mahler <bm...@apache.org>
Authored: Fri Aug 4 15:46:21 2017 -0700
Committer: Benjamin Mahler <bm...@apache.org>
Committed: Thu Aug 24 13:10:29 2017 -0700

----------------------------------------------------------------------
 src/slave/slave.cpp | 68 +++++++++++++++++++++++-------------------------
 src/slave/slave.hpp |  3 +++
 2 files changed, 35 insertions(+), 36 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/ec86ad92/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index db60b93..a618fb2 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -1801,28 +1801,11 @@ void Slave::_run(
 
   const ExecutorID& executorId = executorInfo.executor_id();
 
-  // Remove the task/task group from being pending. If any of the
-  // tasks in the task group have been killed in the interim, we
-  // send a TASK_KILLED for all the other tasks in the group.
+  // If any of the tasks in the task group have been killed in the interim,
+  // we send a TASK_KILLED for all the other tasks in the group.
   bool killed = false;
   foreach (const TaskInfo& _task, tasks) {
-    if (framework->pending.contains(executorId) &&
-        framework->pending[executorId].contains(_task.task_id())) {
-      framework->pending[executorId].erase(_task.task_id());
-      if (framework->pending[executorId].empty()) {
-        framework->pending.erase(executorId);
-        // NOTE: Ideally we would perform the following check here:
-        //
-        //   if (framework->executors.empty() &&
-        //       framework->pending.empty()) {
-        //     removeFramework(framework);
-        //   }
-        //
-        // However, we need 'framework' to stay valid for the rest of
-        // this function. As such, we perform the check before each of
-        // the 'return' statements below.
-      }
-    } else {
+    if (!framework->removePendingTask(_task.task_id())) {
       killed = true;
     }
   }
@@ -2471,23 +2454,20 @@ void Slave::killTask(
     return;
   }
 
-  foreachkey (const ExecutorID& executorId, framework->pending) {
-    if (framework->pending[executorId].contains(taskId)) {
-      LOG(WARNING) << "Killing task " << taskId
-                   << " of framework " << frameworkId
-                   << " before it was launched";
+  // TODO(bmahler): Removing the task here is a bug: MESOS-7783.
+  bool removedWhilePending = framework->removePendingTask(taskId);
 
-      // We send the TASK_KILLED status update in `_run()` as the
-      // task being killed could be part of a task group and we
-      // don't store this information in `framework->pending`.
-      // We don't invoke `removeFramework()` here since we need the
-      // framework to be valid for sending the status update later.
-     framework->pending[executorId].erase(taskId);
-     if (framework->pending[executorId].empty()) {
-       framework->pending.erase(executorId);
-     }
-     return;
-    }
+  if (removedWhilePending) {
+    LOG(WARNING) << "Killing task " << taskId
+                 << " of framework " << frameworkId
+                 << " before it was launched";
+
+    // We send the TASK_KILLED status update in `_run()` as the
+    // task being killed could be part of a task group and we
+    // don't store this information in `framework->pending`.
+    // We don't invoke `removeFramework()` here since we need the
+    // framework to be valid for sending the status update later.
+    return;
   }
 
   Executor* executor = framework->getExecutor(taskId);
@@ -6924,6 +6904,22 @@ void Framework::recoverExecutor(const ExecutorState& state)
 }
 
 
+bool Framework::removePendingTask(const TaskID& taskId)
+{
+  foreachkey (const ExecutorID& executorId, pending) {
+    if (pending.at(executorId).contains(taskId)) {
+      pending.at(executorId).erase(taskId);
+      if (pending.at(executorId).empty()) {
+        pending.erase(executorId);
+      }
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
 Executor::Executor(
     Slave* _slave,
     const FrameworkID& _frameworkId,

http://git-wip-us.apache.org/repos/asf/mesos/blob/ec86ad92/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index 598a3fb..bc6f5f7 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -1052,6 +1052,9 @@ struct Framework
   void recoverExecutor(const state::ExecutorState& state);
   void checkpointFramework() const;
 
+  // Returns whether the pending task was removed.
+  bool removePendingTask(const TaskID& taskId);
+
   const FrameworkID id() const { return info.id(); }
 
   enum State