You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bb...@apache.org on 2019/04/10 08:58:23 UTC
[mesos] branch 1.8.x updated: Fixed
`AgentFailoverHTTPExecutorUsingResourceProviderResources` flake.
This is an automated email from the ASF dual-hosted git repository.
bbannier pushed a commit to branch 1.8.x
in repository https://gitbox.apache.org/repos/asf/mesos.git
The following commit(s) were added to refs/heads/1.8.x by this push:
new 2f8c3d1 Fixed `AgentFailoverHTTPExecutorUsingResourceProviderResources` flake.
2f8c3d1 is described below
commit 2f8c3d17691df767bb0fac1274077bbc8b6da0f0
Author: Chun-Hung Hsiao <ch...@apache.org>
AuthorDate: Wed Apr 10 10:07:20 2019 +0200
Fixed `AgentFailoverHTTPExecutorUsingResourceProviderResources` flake.
The test is flaky because:
* It assumes the mock RP never reregisters, which might not be true.
* It does not wait for the task and executor to be reaped, which would
lead to a race between containerizer destroy and test teardown and
cause cgroups cleanup to fail.
* It fast-forwards the clock, which might lead to containerizer
timed out to wait for all nested containers to be killed.
* It assumes that the framework only receives two status updates,
which might not be true.
Review: https://reviews.apache.org/r/70439/
---
src/tests/mesos.hpp | 25 +++++++++++++++++++++++++
src/tests/slave_tests.cpp | 22 ++++++++++++++--------
2 files changed, 39 insertions(+), 8 deletions(-)
diff --git a/src/tests/mesos.hpp b/src/tests/mesos.hpp
index 1a6d826..d8c17ea 100644
--- a/src/tests/mesos.hpp
+++ b/src/tests/mesos.hpp
@@ -3086,8 +3086,18 @@ public:
driver->start();
}
+ void stop()
+ {
+ driver.reset();
+ }
+
void connectedDefault()
{
+ // Do nothing if this is asynchronously called after `stop` is invoked.
+ if (driver == nullptr) {
+ return;
+ }
+
Call call;
call.set_type(Call::SUBSCRIBE);
call.mutable_subscribe()->mutable_resource_provider_info()->CopyFrom(info);
@@ -3097,6 +3107,11 @@ public:
void subscribedDefault(const typename Event::Subscribed& subscribed)
{
+ // Do nothing if this is asynchronously called after `stop` is invoked.
+ if (driver == nullptr) {
+ return;
+ }
+
info.mutable_id()->CopyFrom(subscribed.provider_id());
if (resources.isSome()) {
@@ -3122,6 +3137,11 @@ public:
void operationDefault(const typename Event::ApplyOperation& operation)
{
+ // Do nothing if this is asynchronously called after `stop` is invoked.
+ if (driver == nullptr) {
+ return;
+ }
+
CHECK(info.has_id());
Call call;
@@ -3196,6 +3216,11 @@ public:
void publishDefault(const typename Event::PublishResources& publish)
{
+ // Do nothing if this is asynchronously called after `stop` is invoked.
+ if (driver == nullptr) {
+ return;
+ }
+
CHECK(info.has_id());
Call call;
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index b1c3a01..6df461b 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -139,6 +139,7 @@ using std::string;
using std::vector;
using testing::_;
+using testing::AtLeast;
using testing::AtMost;
using testing::DoAll;
using testing::Eq;
@@ -11691,6 +11692,7 @@ TEST_F(SlaveTest, AgentFailoverHTTPExecutorUsingResourceProviderResources)
Future<v1::scheduler::Event::Update> taskStarting;
Future<v1::scheduler::Event::Update> taskRunning;
EXPECT_CALL(*scheduler, update(_, _))
+ .Times(AtLeast(2))
.WillOnce(DoAll(
v1::scheduler::SendAcknowledge(frameworkId, agentId),
FutureArg<1>(&taskStarting)))
@@ -11732,22 +11734,26 @@ TEST_F(SlaveTest, AgentFailoverHTTPExecutorUsingResourceProviderResources)
// Fail over the agent. We expect the agent to destroy the running
// container since the resource provider has not resubscribed when
// the executor resubscribes which prevents publishing of used resources.
- EXPECT_CALL(resourceProvider, disconnected());
+ EXPECT_CALL(resourceProvider, disconnected())
+ .Times(AtMost(1));
slave.get()->terminate();
- Future<Nothing> destroy =
- FUTURE_DISPATCH(_, &MesosContainerizerProcess::destroy);
+ // Stop the mock resource provider so it won't resubscribe. This ensures that
+ // the executor container will be destroyed during agent recovery.
+ resourceProvider.stop();
+
+ Future<Nothing> executorTerminated =
+ FUTURE_DISPATCH(_, &Slave::executorTerminated);
slave = StartSlave(&detector, processId, slaveFlags);
ASSERT_SOME(slave);
- AWAIT_READY(destroy);
+ // Resume the clock so the task and the executor can be reaped.
+ Clock::resume();
- // Trigger a executor registration timeout so the agent finishes
- // recovery and can shut down as part of being destructed.
- Clock::advance(slaveFlags.executor_registration_timeout);
- Clock::settle();
+ // Wait until the executor is reaped.
+ AWAIT_READY(executorTerminated);
// NOTE: We do not check that the task is reported as `TASK_LOST`
// to the framework since we have not made sure that the task status