You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2013/09/04 09:09:29 UTC

[2/6] git commit: Fixed a CHECK failure in the master when successive exited events occur for a disconnected slave.

Fixed a CHECK failure in the master that was triggered when successive
exited() events occur for an already-disconnected checkpointing slave.

Review: https://reviews.apache.org/r/13956


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/dc2ab2ff
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/dc2ab2ff
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/dc2ab2ff

Branch: refs/heads/master
Commit: dc2ab2ff4b868c479701fcf6aa2d43903e32b95c
Parents: 3683ab6
Author: Benjamin Mahler <bm...@twitter.com>
Authored: Tue Sep 3 17:53:03 2013 -0700
Committer: Benjamin Mahler <bm...@twitter.com>
Committed: Tue Sep 3 22:48:43 2013 -0700

----------------------------------------------------------------------
 src/master/master.cpp | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/dc2ab2ff/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 874d7fa..a2ffe7f 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -557,11 +557,7 @@ void Master::exited(const UPID& pid)
                   << "because it is not checkpointing!";
         removeSlave(slave);
         return;
-      } else {
-        CHECK(!slave->disconnected)
-              << "Slave " << slave->id << " ("
-              << slave->info.hostname() << ") already disconnected!" ;
-
+      } else if (!slave->disconnected) {
         // Mark the slave as disconnected and remove it from the allocator.
         slave->disconnected = true;
 
@@ -601,6 +597,10 @@ void Master::exited(const UPID& pid)
           // Remove and rescind offers.
           removeOffer(offer, true); // Rescind!
         }
+      } else {
+        LOG(WARNING) << "Ignoring duplicate exited() notification for "
+                     << "checkpointing slave " << slave->id
+                     << " (" << slave->info.hostname() << ")";
       }
     }
   }
@@ -1143,6 +1143,12 @@ void Master::reregisterSlave(const SlaveID& slaveId,
       reply(message);
 
       // Update the slave pid and relink to it.
+      // NOTE: Always re-linking the slave here, rather than only
+      // when the slave is disconnected, can lead to multiple exited
+      // events in succession for a disconnected slave. As a result,
+      // we ignore duplicate exited events for disconnected
+      // checkpointing slaves.
+      // See: https://issues.apache.org/jira/browse/MESOS-675
       slave->pid = from;
       link(slave->pid);