You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2013/03/15 00:07:22 UTC
svn commit: r1456712 - in /incubator/mesos/trunk: include/mesos/mesos.proto
src/common/type_utils.hpp src/master/master.cpp src/slave/slave.cpp
src/tests/slave_recovery_tests.cpp
Author: vinodkone
Date: Thu Mar 14 23:07:21 2013
New Revision: 1456712
URL: http://svn.apache.org/r1456712
Log:
Added a checkpoint field to slave info and fixed the master to remove
disconnected slaves that are not checkpointing.
Review: https://reviews.apache.org/r/9927
Modified:
incubator/mesos/trunk/include/mesos/mesos.proto
incubator/mesos/trunk/src/common/type_utils.hpp
incubator/mesos/trunk/src/master/master.cpp
incubator/mesos/trunk/src/slave/slave.cpp
incubator/mesos/trunk/src/tests/slave_recovery_tests.cpp
Modified: incubator/mesos/trunk/include/mesos/mesos.proto
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/include/mesos/mesos.proto?rev=1456712&r1=1456711&r2=1456712&view=diff
==============================================================================
--- incubator/mesos/trunk/include/mesos/mesos.proto (original)
+++ incubator/mesos/trunk/include/mesos/mesos.proto Thu Mar 14 23:07:21 2013
@@ -166,6 +166,9 @@ message MasterInfo {
* hostnames (e.g., Amazon EC2). Note that the 'id' field is only
* available after a slave is registered with the master, and is made
* available here to facilitate re-registration.
+ * If checkpoint is set, the slave is checkpointing its own
+ * information and potentially frameworks' information (if a
+ * framework has checkpointing enabled).
*/
message SlaveInfo {
required string hostname = 1;
@@ -175,6 +178,7 @@ message SlaveInfo {
repeated Resource resources = 3;
repeated Attribute attributes = 5;
optional SlaveID id = 6;
+ optional bool checkpoint = 7 [default = false];
}
Modified: incubator/mesos/trunk/src/common/type_utils.hpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/common/type_utils.hpp?rev=1456712&r1=1456711&r2=1456712&view=diff
==============================================================================
--- incubator/mesos/trunk/src/common/type_utils.hpp (original)
+++ incubator/mesos/trunk/src/common/type_utils.hpp Thu Mar 14 23:07:21 2013
@@ -273,7 +273,9 @@ inline bool operator == (const SlaveInfo
left.has_webui_port() == right.has_webui_port() &&
(!left.has_webui_port() || (left.webui_port() == right.webui_port())) &&
left.has_id() == right.has_id() &&
- (!left.has_id() || (left.id() == right.id()));
+ (!left.has_id() || (left.id() == right.id())) &&
+ left.has_checkpoint() == right.has_checkpoint() &&
+ (!left.has_checkpoint() || (left.checkpoint() == right.checkpoint()));
}
Modified: incubator/mesos/trunk/src/master/master.cpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/master/master.cpp?rev=1456712&r1=1456711&r2=1456712&view=diff
==============================================================================
--- incubator/mesos/trunk/src/master/master.cpp (original)
+++ incubator/mesos/trunk/src/master/master.cpp Thu Mar 14 23:07:21 2013
@@ -515,6 +515,24 @@ void Master::exited(const UPID& pid)
return;
}
}
+
+ foreachvalue (Slave* slave, slaves) {
+ if (slave->pid == pid) {
+ LOG(INFO) << "Slave " << slave->id << "(" << slave->info.hostname()
+ << ") disconnected";
+
+ // Remove the slave, if it is not checkpointing.
+ // TODO(vinod): Even if a slave is checkpointing, transition all
+ // tasks of frameworks that have disabled checkpointing.
+ if (!slave->info.checkpoint()) {
+ LOG(INFO) << "Removing disconnected slave " << slave->id
+ << "(" << slave->info.hostname() << ") "
+ << "because it is not checkpointing!";
+ removeSlave(slave);
+ return;
+ }
+ }
+ }
}
Modified: incubator/mesos/trunk/src/slave/slave.cpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/slave/slave.cpp?rev=1456712&r1=1456711&r2=1456712&view=diff
==============================================================================
--- incubator/mesos/trunk/src/slave/slave.cpp (original)
+++ incubator/mesos/trunk/src/slave/slave.cpp Thu Mar 14 23:07:21 2013
@@ -225,6 +225,7 @@ void Slave::initialize()
info.set_webui_hostname(webui_hostname); // Deprecated!
info.mutable_resources()->MergeFrom(resources);
info.mutable_attributes()->MergeFrom(attributes);
+ info.set_checkpoint(flags.checkpoint);
// Spawn and initialize the isolation module.
// TODO(benh): Seems like the isolation module should really be
@@ -409,18 +410,6 @@ void Slave::finalize()
// TODO(vinod): Wait until all the executors have terminated.
terminate(isolationModule);
wait(isolationModule);
-
- // We send an unregister message to the master here, so that it can
- // remove the slave. This is important because lot of our tests terminate()
- // the slave and expect the master to remove the slave as a consequence.
- // But since the master no longer removes the slave when a slave exits, we
- // send an UnregisterSlaveMessage to master so that it removes the slave.
- // This is OK because, finalize() is only ever going to be called in tests!
- if (master && info.has_id()) {
- UnregisterSlaveMessage message;
- message.mutable_slave_id()->CopyFrom(info.id());
- send(master, message);
- }
}
Modified: incubator/mesos/trunk/src/tests/slave_recovery_tests.cpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/tests/slave_recovery_tests.cpp?rev=1456712&r1=1456711&r2=1456712&view=diff
==============================================================================
--- incubator/mesos/trunk/src/tests/slave_recovery_tests.cpp (original)
+++ incubator/mesos/trunk/src/tests/slave_recovery_tests.cpp Thu Mar 14 23:07:21 2013
@@ -153,13 +153,6 @@ public:
ASSERT_TRUE(GTEST_IS_THREADSAFE);
- // Always, drop the unregisterSlaveMessage sent by a slave when
- // it is terminated. This will stop the master from removing the
- // slave, which is what we expect to happen in the real world
- // when a slave exits.
- EXPECT_MESSAGE(Eq(UnregisterSlaveMessage().GetTypeName()), _, _)
- .WillRepeatedly(Return(true));
-
a = new Allocator(&allocator);
m = new Master(a, &files);
master = process::spawn(m);
@@ -174,10 +167,6 @@ public:
virtual void TearDown()
{
- // Wait for the executor to exit.
- EXPECT_MESSAGE(Eq(UnregisterSlaveMessage().GetTypeName()), _, _)
- .WillRepeatedly(Return(true));
-
stopSlave(true);
process::terminate(master);