You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bb...@apache.org on 2019/04/10 08:58:23 UTC

[mesos] branch 1.8.x updated: Fixed `AgentFailoverHTTPExecutorUsingResourceProviderResources` flake.

This is an automated email from the ASF dual-hosted git repository.

bbannier pushed a commit to branch 1.8.x
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/1.8.x by this push:
     new 2f8c3d1  Fixed `AgentFailoverHTTPExecutorUsingResourceProviderResources` flake.
2f8c3d1 is described below

commit 2f8c3d17691df767bb0fac1274077bbc8b6da0f0
Author: Chun-Hung Hsiao <ch...@apache.org>
AuthorDate: Wed Apr 10 10:07:20 2019 +0200

    Fixed `AgentFailoverHTTPExecutorUsingResourceProviderResources` flake.
    
    The test is flaky because:
    
      * It assumes the mock RP never reregisters, which might not be true.
    
      * It does not wait for the task and executor to be reaped, which would
        lead to a race between containerizer destroy and test teardown and
        cause cgroups cleanup to fail.
    
      * It fast-forwards the clock, which might lead to containerizer
        timed out to wait for all nested containers to be killed.
    
      * It assumes that the framework only receives two status updates,
        which might not be true.
    
    Review: https://reviews.apache.org/r/70439/
---
 src/tests/mesos.hpp       | 25 +++++++++++++++++++++++++
 src/tests/slave_tests.cpp | 22 ++++++++++++++--------
 2 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/src/tests/mesos.hpp b/src/tests/mesos.hpp
index 1a6d826..d8c17ea 100644
--- a/src/tests/mesos.hpp
+++ b/src/tests/mesos.hpp
@@ -3086,8 +3086,18 @@ public:
     driver->start();
   }
 
+  void stop()
+  {
+    driver.reset();
+  }
+
   void connectedDefault()
   {
+    // Do nothing if this is asynchronously called after `stop` is invoked.
+    if (driver == nullptr) {
+      return;
+    }
+
     Call call;
     call.set_type(Call::SUBSCRIBE);
     call.mutable_subscribe()->mutable_resource_provider_info()->CopyFrom(info);
@@ -3097,6 +3107,11 @@ public:
 
   void subscribedDefault(const typename Event::Subscribed& subscribed)
   {
+    // Do nothing if this is asynchronously called after `stop` is invoked.
+    if (driver == nullptr) {
+      return;
+    }
+
     info.mutable_id()->CopyFrom(subscribed.provider_id());
 
     if (resources.isSome()) {
@@ -3122,6 +3137,11 @@ public:
 
   void operationDefault(const typename Event::ApplyOperation& operation)
   {
+    // Do nothing if this is asynchronously called after `stop` is invoked.
+    if (driver == nullptr) {
+      return;
+    }
+
     CHECK(info.has_id());
 
     Call call;
@@ -3196,6 +3216,11 @@ public:
 
   void publishDefault(const typename Event::PublishResources& publish)
   {
+    // Do nothing if this is asynchronously called after `stop` is invoked.
+    if (driver == nullptr) {
+      return;
+    }
+
     CHECK(info.has_id());
 
     Call call;
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index b1c3a01..6df461b 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -139,6 +139,7 @@ using std::string;
 using std::vector;
 
 using testing::_;
+using testing::AtLeast;
 using testing::AtMost;
 using testing::DoAll;
 using testing::Eq;
@@ -11691,6 +11692,7 @@ TEST_F(SlaveTest, AgentFailoverHTTPExecutorUsingResourceProviderResources)
   Future<v1::scheduler::Event::Update> taskStarting;
   Future<v1::scheduler::Event::Update> taskRunning;
   EXPECT_CALL(*scheduler, update(_, _))
+    .Times(AtLeast(2))
     .WillOnce(DoAll(
         v1::scheduler::SendAcknowledge(frameworkId, agentId),
         FutureArg<1>(&taskStarting)))
@@ -11732,22 +11734,26 @@ TEST_F(SlaveTest, AgentFailoverHTTPExecutorUsingResourceProviderResources)
   // Fail over the agent. We expect the agent to destroy the running
   // container since the resource provider has not resubscribed when
   // the executor resubscribes which prevents publishing of used resources.
-  EXPECT_CALL(resourceProvider, disconnected());
+  EXPECT_CALL(resourceProvider, disconnected())
+    .Times(AtMost(1));
 
   slave.get()->terminate();
 
-  Future<Nothing> destroy =
-    FUTURE_DISPATCH(_, &MesosContainerizerProcess::destroy);
+  // Stop the mock resource provider so it won't resubscribe. This ensures that
+  // the executor container will be destroyed during agent recovery.
+  resourceProvider.stop();
+
+  Future<Nothing> executorTerminated =
+    FUTURE_DISPATCH(_, &Slave::executorTerminated);
 
   slave = StartSlave(&detector, processId, slaveFlags);
   ASSERT_SOME(slave);
 
-  AWAIT_READY(destroy);
+  // Resume the clock so the task and the executor can be reaped.
+  Clock::resume();
 
-  // Trigger a executor registration timeout so the agent finishes
-  // recovery and can shut down as part of being destructed.
-  Clock::advance(slaveFlags.executor_registration_timeout);
-  Clock::settle();
+  // Wait until the executor is reaped.
+  AWAIT_READY(executorTerminated);
 
   // NOTE: We do not check that the task is reported as `TASK_LOST`
   // to the framework since we have not made sure that the task status