You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by gr...@apache.org on 2019/09/13 02:17:16 UTC

[mesos] branch master updated: Fixed a bug for non-partition-aware schedulers.

This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
     new 8e1a512  Fixed a bug for non-partition-aware schedulers.
8e1a512 is described below

commit 8e1a51207304589a6521cff3540e0705fe1533ff
Author: Greg Mann <gr...@mesosphere.io>
AuthorDate: Thu Sep 12 16:33:20 2019 -0700

    Fixed a bug for non-partition-aware schedulers.
    
    Previously, the agent would send task status updates with the state
    TASK_GONE_BY_OPERATOR to all schedulers when an agent was drained
    with the `mark_gone` parameter set to `true`.
    
    This patch updates the code so that TASK_GONE_BY_OPERATOR is sent
    only to partition-aware schedulers.
    
    Review: https://reviews.apache.org/r/71480/
---
 src/slave/slave.cpp       | 69 ++++++++++++++++++++++++-----------------------
 src/tests/slave_tests.cpp | 20 +++++++++++---
 2 files changed, 52 insertions(+), 37 deletions(-)

diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 4e93656..96890d3 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -5773,40 +5773,6 @@ void Slave::statusUpdate(StatusUpdate update, const Option<UPID>& pid)
   update.mutable_status()->set_source(
       pid == UPID() ? TaskStatus::SOURCE_SLAVE : TaskStatus::SOURCE_EXECUTOR);
 
-  // If the agent is draining we provide additional
-  // information for KILLING or KILLED states.
-  if (drainConfig.isSome()) {
-    switch (update.status().state()) {
-      case TASK_STAGING:
-      case TASK_STARTING:
-      case TASK_RUNNING:
-      case TASK_FAILED:
-      case TASK_FINISHED:
-      case TASK_ERROR:
-      case TASK_LOST:
-      case TASK_DROPPED:
-      case TASK_UNREACHABLE:
-      case TASK_GONE:
-      case TASK_GONE_BY_OPERATOR:
-      case TASK_UNKNOWN: {
-        break;
-      }
-      case TASK_KILLING:
-      case TASK_KILLED: {
-        // We unconditionally overwrite any previous reason to provide a
-        // consistent signal that this task went away during draining.
-        update.mutable_status()->set_reason(TaskStatus::REASON_SLAVE_DRAINING);
-
-        // If the draining marks the agent as gone report tasks as
-        // gone by operator.
-        if (drainConfig->mark_gone()) {
-          update.mutable_status()->set_state(TASK_GONE_BY_OPERATOR);
-        }
-        break;
-      }
-    }
-  }
-
   // Set TaskStatus.executor_id if not already set; overwrite existing
   // value if already set.
   if (update.has_executor_id()) {
@@ -5843,6 +5809,41 @@ void Slave::statusUpdate(StatusUpdate update, const Option<UPID>& pid)
     return;
   }
 
+  // If the agent is draining we provide additional
+  // information for KILLING or KILLED states.
+  if (drainConfig.isSome()) {
+    switch (update.status().state()) {
+      case TASK_STAGING:
+      case TASK_STARTING:
+      case TASK_RUNNING:
+      case TASK_FAILED:
+      case TASK_FINISHED:
+      case TASK_ERROR:
+      case TASK_LOST:
+      case TASK_DROPPED:
+      case TASK_UNREACHABLE:
+      case TASK_GONE:
+      case TASK_GONE_BY_OPERATOR:
+      case TASK_UNKNOWN: {
+        break;
+      }
+      case TASK_KILLING:
+      case TASK_KILLED: {
+        // We unconditionally overwrite any previous reason to provide a
+        // consistent signal that this task went away during draining.
+        update.mutable_status()->set_reason(TaskStatus::REASON_SLAVE_DRAINING);
+
+        // If the draining marks the agent as gone report tasks as
+        // gone by operator.
+        if (drainConfig->mark_gone() &&
+            framework->capabilities.partitionAware) {
+          update.mutable_status()->set_state(TASK_GONE_BY_OPERATOR);
+        }
+        break;
+      }
+    }
+  }
+
   if (HookManager::hooksAvailable()) {
     // Even though the hook(s) return a TaskStatus, we only use two fields:
     // container_status and labels. Remaining fields are discarded.
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 02b65a9..c147bfc 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -12040,10 +12040,16 @@ TEST_F(SlaveTest, DrainAgentKillsRunningTask)
 
   AWAIT_READY(updateSlaveMessage);
 
+  // Set the partition-aware capability to ensure that the terminal update state
+  // is TASK_GONE_BY_OPERATOR, since we will set `mark_gone = true`.
+  v1::FrameworkInfo frameworkInfo = v1::DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.add_capabilities()->set_type(
+      v1::FrameworkInfo::Capability::PARTITION_AWARE);
+
   auto scheduler = std::make_shared<v1::MockHTTPScheduler>();
 
   EXPECT_CALL(*scheduler, connected(_))
-    .WillOnce(v1::scheduler::SendSubscribe(v1::DEFAULT_FRAMEWORK_INFO));
+    .WillOnce(v1::scheduler::SendSubscribe(frameworkInfo));
 
   Future<v1::scheduler::Event::Subscribed> subscribed;
   EXPECT_CALL(*scheduler, subscribed(_, _))
@@ -12160,10 +12166,16 @@ TEST_F(SlaveTest, DrainAgentKillsQueuedTask)
 
   AWAIT_READY(updateSlaveMessage);
 
+  // Set the partition-aware capability to ensure that the terminal update state
+  // is TASK_GONE_BY_OPERATOR, since we will set `mark_gone = true`.
+  v1::FrameworkInfo frameworkInfo = v1::DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.add_capabilities()->set_type(
+      v1::FrameworkInfo::Capability::PARTITION_AWARE);
+
   auto scheduler = std::make_shared<v1::MockHTTPScheduler>();
 
   EXPECT_CALL(*scheduler, connected(_))
-    .WillOnce(v1::scheduler::SendSubscribe(v1::DEFAULT_FRAMEWORK_INFO));
+    .WillOnce(v1::scheduler::SendSubscribe(frameworkInfo));
 
   Future<v1::scheduler::Event::Subscribed> subscribed;
   EXPECT_CALL(*scheduler, subscribed(_, _))
@@ -12351,7 +12363,9 @@ TEST_F(SlaveTest, DrainAgentKillsPendingTask)
 
   AWAIT_READY(killedUpdate);
 
-  EXPECT_EQ(v1::TASK_GONE_BY_OPERATOR, killedUpdate->status().state());
+  // The terminal update state in this case should be TASK_KILLED because the
+  // scheduler is not partition-aware.
+  EXPECT_EQ(v1::TASK_KILLED, killedUpdate->status().state());
   EXPECT_EQ(
       v1::TaskStatus::REASON_AGENT_DRAINING, killedUpdate->status().reason());
 }