You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ch...@apache.org on 2018/11/28 23:27:03 UTC
[mesos] 01/09: Fixed master crash when executors send messages to
recovered frameworks.
This is an automated email from the ASF dual-hosted git repository.
chhsiao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 276774c4d88dbd2f5147c90c53b795e1ec889494
Author: Chun-Hung Hsiao <ch...@mesosphere.io>
AuthorDate: Mon Nov 26 20:12:36 2018 -0800
Fixed master crash when executors send messages to recovered frameworks.
The `Framework::send` function assumes that either `http` or `pid` is
set, which is not true for a framework that hasn't yet reregistered yet
but recovered from a reregistered agent. As a result, the master would
crash when a recovered executor tries to send a message to such a
framework (see MESOS-9419). This patch fixes this crash bug.
Review: https://reviews.apache.org/r/69451
---
src/master/master.cpp | 10 ++++++++++
src/master/master.hpp | 26 ++++++++++++++++++++------
2 files changed, 30 insertions(+), 6 deletions(-)
diff --git a/src/master/master.cpp b/src/master/master.cpp
index b4b02d8..3b3824a 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -6482,6 +6482,16 @@ void Master::executorMessage(
return;
}
+ if (!framework->connected()) {
+ LOG(WARNING) << "Not forwarding executor message for executor '"
+ << executorId << "' of framework " << frameworkId
+ << " on agent " << *slave
+ << " because the framework is disconnected";
+
+ metrics->invalid_executor_to_framework_messages++;
+ return;
+ }
+
ExecutorToFrameworkMessage message;
*message.mutable_slave_id() =
std::move(*executorToFrameworkMessage.mutable_slave_id());
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 3b3c1a4..c7becfa 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -2591,21 +2591,35 @@ private:
template <typename Message>
void Framework::send(const Message& message)
{
+ metrics.incrementEvent(message);
+
if (!connected()) {
- LOG(WARNING) << "Master attempted to send message to disconnected"
+ LOG(WARNING) << "Master attempting to send message to disconnected"
<< " framework " << *this;
- }
- metrics.incrementEvent(message);
+ // NOTE: We proceed here without returning to support the case where a
+ // "disconnected" framework is still talking to the master and the master
+ // wants to shut it down by sending a `FrameworkErrorMessage`. This can
+ // occur in a one-way network partition where the master -> framework link
+ // is broken but the framework -> master link remains intact. Note that we
+ // have no periodic heartbeats between the master and pid-based schedulers.
+ //
+ // TODO(chhsiao): Update the `FrameworkErrorMessage` call-sites that rely on
+ // the lack of a `return` here to directly call `process::send` so that this
+ // function doesn't need to deal with the special case. Then we can check
+ // that one of `http` or `pid` is set if the framework is connected.
+ }
if (http.isSome()) {
if (!http->send(message)) {
- LOG(WARNING) << "Unable to send event to framework " << *this << ":"
+ LOG(WARNING) << "Unable to send message to framework " << *this << ":"
<< " connection closed";
}
- } else {
- CHECK_SOME(pid);
+ } else if (pid.isSome()) {
master->send(pid.get(), message);
+ } else {
+ LOG(WARNING) << "Unable to send message to framework " << *this << ":"
+ << " framework is recovered but has not reregistered";
}
}