You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2013/03/15 00:07:22 UTC

svn commit: r1456712 - in /incubator/mesos/trunk: include/mesos/mesos.proto src/common/type_utils.hpp src/master/master.cpp src/slave/slave.cpp src/tests/slave_recovery_tests.cpp

Author: vinodkone
Date: Thu Mar 14 23:07:21 2013
New Revision: 1456712

URL: http://svn.apache.org/r1456712
Log:
Added a 'checkpoint' field to SlaveInfo and fixed the master to remove
disconnected slaves that are not checkpointing.

Review: https://reviews.apache.org/r/9927

Modified:
    incubator/mesos/trunk/include/mesos/mesos.proto
    incubator/mesos/trunk/src/common/type_utils.hpp
    incubator/mesos/trunk/src/master/master.cpp
    incubator/mesos/trunk/src/slave/slave.cpp
    incubator/mesos/trunk/src/tests/slave_recovery_tests.cpp

Modified: incubator/mesos/trunk/include/mesos/mesos.proto
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/include/mesos/mesos.proto?rev=1456712&r1=1456711&r2=1456712&view=diff
==============================================================================
--- incubator/mesos/trunk/include/mesos/mesos.proto (original)
+++ incubator/mesos/trunk/include/mesos/mesos.proto Thu Mar 14 23:07:21 2013
@@ -166,6 +166,9 @@ message MasterInfo {
  * hostnames (e.g., Amazon EC2). Note that the 'id' field is only
  * available after a slave is registered with the master, and is made
  * available here to facilitate re-registration.
+ * If checkpoint is set, the slave is checkpointing its own
+ * information and potentially frameworks' information (if a
+ * framework has checkpointing enabled).
  */
 message SlaveInfo {
   required string hostname = 1;
@@ -175,6 +178,7 @@ message SlaveInfo {
   repeated Resource resources = 3;
   repeated Attribute attributes = 5;
   optional SlaveID id = 6;
+  optional bool checkpoint = 7 [default = false];
 }
 
 

Modified: incubator/mesos/trunk/src/common/type_utils.hpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/common/type_utils.hpp?rev=1456712&r1=1456711&r2=1456712&view=diff
==============================================================================
--- incubator/mesos/trunk/src/common/type_utils.hpp (original)
+++ incubator/mesos/trunk/src/common/type_utils.hpp Thu Mar 14 23:07:21 2013
@@ -273,7 +273,9 @@ inline bool operator == (const SlaveInfo
     left.has_webui_port() == right.has_webui_port() &&
     (!left.has_webui_port() || (left.webui_port() == right.webui_port())) &&
     left.has_id() == right.has_id() &&
-    (!left.has_id() || (left.id() == right.id()));
+    (!left.has_id() || (left.id() == right.id())) &&
+    left.has_checkpoint() == right.has_checkpoint() &&
+    (!left.has_checkpoint() || (left.checkpoint() == right.checkpoint()));
 }
 
 

Modified: incubator/mesos/trunk/src/master/master.cpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/master/master.cpp?rev=1456712&r1=1456711&r2=1456712&view=diff
==============================================================================
--- incubator/mesos/trunk/src/master/master.cpp (original)
+++ incubator/mesos/trunk/src/master/master.cpp Thu Mar 14 23:07:21 2013
@@ -515,6 +515,24 @@ void Master::exited(const UPID& pid)
       return;
     }
   }
+
+  foreachvalue (Slave* slave, slaves) {
+    if (slave->pid == pid) {
+      LOG(INFO) << "Slave " << slave->id << "(" << slave->info.hostname()
+                << ") disconnected";
+
+      // Remove the slave, if it is not checkpointing.
+      // TODO(vinod): Even if a slave is checkpointing, transition all
+      // tasks of frameworks that have disabled checkpointing.
+      if (!slave->info.checkpoint()) {
+        LOG(INFO) << "Removing disconnected slave " << slave->id
+                  << "(" << slave->info.hostname() << ") "
+                  << "because it is not checkpointing!";
+        removeSlave(slave);
+        return;
+      }
+    }
+  }
 }
 
 

Modified: incubator/mesos/trunk/src/slave/slave.cpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/slave/slave.cpp?rev=1456712&r1=1456711&r2=1456712&view=diff
==============================================================================
--- incubator/mesos/trunk/src/slave/slave.cpp (original)
+++ incubator/mesos/trunk/src/slave/slave.cpp Thu Mar 14 23:07:21 2013
@@ -225,6 +225,7 @@ void Slave::initialize()
   info.set_webui_hostname(webui_hostname); // Deprecated!
   info.mutable_resources()->MergeFrom(resources);
   info.mutable_attributes()->MergeFrom(attributes);
+  info.set_checkpoint(flags.checkpoint);
 
   // Spawn and initialize the isolation module.
   // TODO(benh): Seems like the isolation module should really be
@@ -409,18 +410,6 @@ void Slave::finalize()
   // TODO(vinod): Wait until all the executors have terminated.
   terminate(isolationModule);
   wait(isolationModule);
-
-  // We send an unregister message to the master here, so that it can
-  // remove the slave. This is important because lot of our tests terminate()
-  // the slave and expect the master to remove the slave as a consequence.
-  // But since the master no longer removes the slave when a slave exits, we
-  // send an UnregisterSlaveMessage to master so that it removes the slave.
-  // This is OK because, finalize() is only ever going to be called in tests!
-  if (master && info.has_id()) {
-    UnregisterSlaveMessage message;
-    message.mutable_slave_id()->CopyFrom(info.id());
-    send(master, message);
-  }
 }
 
 

Modified: incubator/mesos/trunk/src/tests/slave_recovery_tests.cpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/tests/slave_recovery_tests.cpp?rev=1456712&r1=1456711&r2=1456712&view=diff
==============================================================================
--- incubator/mesos/trunk/src/tests/slave_recovery_tests.cpp (original)
+++ incubator/mesos/trunk/src/tests/slave_recovery_tests.cpp Thu Mar 14 23:07:21 2013
@@ -153,13 +153,6 @@ public:
 
     ASSERT_TRUE(GTEST_IS_THREADSAFE);
 
-    // Always, drop the unregisterSlaveMessage sent by a slave when
-    // it is terminated. This will stop the master from removing the
-    // slave, which is what we expect to happen in the real world
-    // when a slave exits.
-    EXPECT_MESSAGE(Eq(UnregisterSlaveMessage().GetTypeName()), _, _)
-      .WillRepeatedly(Return(true));
-
     a = new Allocator(&allocator);
     m = new Master(a, &files);
     master = process::spawn(m);
@@ -174,10 +167,6 @@ public:
 
   virtual void TearDown()
   {
-    // Wait for the executor to exit.
-    EXPECT_MESSAGE(Eq(UnregisterSlaveMessage().GetTypeName()), _, _)
-      .WillRepeatedly(Return(true));
-
     stopSlave(true);
 
     process::terminate(master);