You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2013/09/04 09:09:29 UTC
[2/6] git commit: Fixed a CHECK failure in the master when successive
exited events occur for a disconnected slave.
Fixed a CHECK failure in the master when successive exited events
occur for a disconnected slave.
Review: https://reviews.apache.org/r/13956
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/dc2ab2ff
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/dc2ab2ff
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/dc2ab2ff
Branch: refs/heads/master
Commit: dc2ab2ff4b868c479701fcf6aa2d43903e32b95c
Parents: 3683ab6
Author: Benjamin Mahler <bm...@twitter.com>
Authored: Tue Sep 3 17:53:03 2013 -0700
Committer: Benjamin Mahler <bm...@twitter.com>
Committed: Tue Sep 3 22:48:43 2013 -0700
----------------------------------------------------------------------
src/master/master.cpp | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/dc2ab2ff/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 874d7fa..a2ffe7f 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -557,11 +557,7 @@ void Master::exited(const UPID& pid)
<< "because it is not checkpointing!";
removeSlave(slave);
return;
- } else {
- CHECK(!slave->disconnected)
- << "Slave " << slave->id << " ("
- << slave->info.hostname() << ") already disconnected!" ;
-
+ } else if (!slave->disconnected) {
// Mark the slave as disconnected and remove it from the allocator.
slave->disconnected = true;
@@ -601,6 +597,10 @@ void Master::exited(const UPID& pid)
// Remove and rescind offers.
removeOffer(offer, true); // Rescind!
}
+ } else {
+ LOG(WARNING) << "Ignoring duplicate exited() notification for "
+ << "checkpointing slave " << slave->id
+ << " (" << slave->info.hostname() << ")";
}
}
}
@@ -1143,6 +1143,12 @@ void Master::reregisterSlave(const SlaveID& slaveId,
reply(message);
// Update the slave pid and relink to it.
+ // NOTE: Always re-linking the slave here, rather than only when
+ // the slave is disconnected, can lead to multiple exited events
+ // in succession for a disconnected slave. As a result, we
+ // ignore duplicate exited events for disconnected checkpointing
+ // slaves.
+ // See: https://issues.apache.org/jira/browse/MESOS-675
slave->pid = from;
link(slave->pid);