You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by gr...@apache.org on 2018/07/10 15:59:51 UTC

[1/2] mesos git commit: Prevented master from asking agents to shutdown on auth failures.

Repository: mesos
Updated Branches:
  refs/heads/master 71d347d73 -> fc5fcfd05


Prevented master from asking agents to shutdown on auth failures.

The Mesos master sends a `ShutdownMessage` to an agent if there is an
authentication or an authorization error during agent (re)registration.

Upon receipt of this message, the agent kills alls its tasks and commits
suicide. This means that transient auth errors can lead to whole agents
being killed along with its tasks.

This patch prevents the master from sending a `ShutdownMessage` in these
cases.

Review: https://reviews.apache.org/r/67791/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/e4ea236d
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/e4ea236d
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/e4ea236d

Branch: refs/heads/master
Commit: e4ea236dae9208186f5858547c958f8c08432f51
Parents: 71d347d
Author: Gastón Kleiman <ga...@mesosphere.io>
Authored: Tue Jul 10 08:04:15 2018 -0700
Committer: Greg Mann <gr...@gmail.com>
Committed: Tue Jul 10 08:20:09 2018 -0700

----------------------------------------------------------------------
 src/master/master.cpp                    |  16 ----
 src/tests/authentication_tests.cpp       |  33 ++++++--
 src/tests/master_authorization_tests.cpp | 105 ++++++++++++++++++++------
 3 files changed, 111 insertions(+), 43 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/e4ea236d/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 0c0d6ca..e14f7de 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -6704,10 +6704,6 @@ void Master::registerSlave(
     // without authentication.
     LOG(WARNING) << "Refusing registration of agent at " << from
                  << " because it is not authenticated";
-
-    ShutdownMessage message;
-    message.set_message("Agent is not authenticated");
-    send(from, message);
     return;
   }
 
@@ -6793,10 +6789,6 @@ void Master::_registerSlave(
                  << " (" << slaveInfo.hostname() << ")"
                  << ": " << authorizationError.get();
 
-    ShutdownMessage message;
-    message.set_message(authorizationError.get());
-    send(pid, message);
-
     slaves.registering.erase(pid);
     return;
   }
@@ -7040,10 +7032,6 @@ void Master::reregisterSlave(
     // reregister without authentication.
     LOG(WARNING) << "Refusing re-registration of agent at " << from
                  << " because it is not authenticated";
-
-    ShutdownMessage message;
-    message.set_message("Agent is not authenticated");
-    send(from, message);
     return;
   }
 
@@ -7158,10 +7146,6 @@ void Master::_reregisterSlave(
                  << " at " << pid << " (" << slaveInfo.hostname() << ")"
                  << ": " << authorizationError.get();
 
-    ShutdownMessage message;
-    message.set_message(authorizationError.get());
-    send(pid, message);
-
     slaves.reregistering.erase(slaveInfo.id());
     return;
   }

http://git-wip-us.apache.org/repos/asf/mesos/blob/e4ea236d/src/tests/authentication_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/authentication_tests.cpp b/src/tests/authentication_tests.cpp
index bd46cbc..c9a8f85 100644
--- a/src/tests/authentication_tests.cpp
+++ b/src/tests/authentication_tests.cpp
@@ -79,14 +79,21 @@ TEST_F(AuthenticationTest, UnauthenticatedFramework)
 
 
 // This test verifies that an unauthenticated slave is
-// denied registration by the master.
+// denied registration by the master, but not shut down.
 TEST_F(AuthenticationTest, UnauthenticatedSlave)
 {
   Try<Owned<cluster::Master>> master = StartMaster();
   ASSERT_SOME(master);
 
-  Future<ShutdownMessage> shutdownMessage =
-    FUTURE_PROTOBUF(ShutdownMessage(), _, _);
+  // Previously, agents were shut down when registration failed due to
+  // authorization. We verify that this no longer occurs.
+  EXPECT_NO_FUTURE_PROTOBUFS(ShutdownMessage(), _, _);
+
+  // We verify that the agent isn't allowed to register.
+  EXPECT_NO_FUTURE_PROTOBUFS(SlaveRegisteredMessage(), _, _);
+
+  Future<RegisterSlaveMessage> registerSlaveMessage1 =
+    FUTURE_PROTOBUF(RegisterSlaveMessage(), _, _);
 
   // Start the slave without credentials.
   slave::Flags flags = CreateSlaveFlags();
@@ -96,9 +103,23 @@ TEST_F(AuthenticationTest, UnauthenticatedSlave)
   Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), flags);
   ASSERT_SOME(slave);
 
-  // Slave should get error message from the master.
-  AWAIT_READY(shutdownMessage);
-  ASSERT_NE("", shutdownMessage->message());
+  AWAIT_READY(registerSlaveMessage1);
+
+  Future<RegisterSlaveMessage> registerSlaveMessage2 =
+    FUTURE_PROTOBUF(RegisterSlaveMessage(), _, _);
+
+  // Advance the clock to trigger another registration attempt.
+  Clock::pause();
+  Clock::advance(slave::REGISTER_RETRY_INTERVAL_MAX);
+  Clock::settle();
+  Clock::resume();
+
+  AWAIT_READY(registerSlaveMessage2);
+
+  // Settle to make sure neither `SlaveRegisteredMessage` nor
+  // `ShutdownMessage` are sent.
+  Clock::pause();
+  Clock::settle();
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/e4ea236d/src/tests/master_authorization_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_authorization_tests.cpp b/src/tests/master_authorization_tests.cpp
index 80b9d49..f65b621 100644
--- a/src/tests/master_authorization_tests.cpp
+++ b/src/tests/master_authorization_tests.cpp
@@ -2390,10 +2390,12 @@ TEST_F(MasterAuthorizationTest, AuthorizedToRegisterAndReregisterAgent)
 }
 
 
-// This test verifies that the agent is shut down by the master if
-// it is not authorized to register.
+// This test verifies that an agent without the right ACLs
+// is not allowed to register and is not shut down.
 TEST_F(MasterAuthorizationTest, UnauthorizedToRegisterAgent)
 {
+  Clock::pause();
+
   // Set up ACLs that disallows the agent's principal to register.
   ACLs acls;
   mesos::ACL::RegisterAgent* acl = acls.add_register_agents();
@@ -2406,14 +2408,31 @@ TEST_F(MasterAuthorizationTest, UnauthorizedToRegisterAgent)
   Try<Owned<cluster::Master>> master = StartMaster(flags);
   ASSERT_SOME(master);
 
-  Future<Message> shutdownMessage =
-    FUTURE_MESSAGE(Eq(ShutdownMessage().GetTypeName()), _, _);
+  // Previously, agents were shut down when registration failed due to
+  // authorization. We verify that this no longer occurs.
+  EXPECT_NO_FUTURE_PROTOBUFS(ShutdownMessage(), _, _);
+
+  // We verify that the agent isn't allowed to register.
+  EXPECT_NO_FUTURE_PROTOBUFS(SlaveRegisteredMessage(), _, _);
+
+  Future<RegisterSlaveMessage> registerSlaveMessage =
+    FUTURE_PROTOBUF(RegisterSlaveMessage(), _, _);
 
   Owned<MasterDetector> detector = master.get()->createDetector();
-  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get());
+  slave::Flags slaveFlags = CreateSlaveFlags();
+
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), slaveFlags);
   ASSERT_SOME(slave);
 
-  AWAIT_READY(shutdownMessage);
+  // Advance the clock to trigger the registration attempt.
+  Clock::advance(slaveFlags.registration_backoff_factor);
+  Clock::settle();
+
+  AWAIT_READY(registerSlaveMessage);
+
+  // Settle to make sure neither `SlaveRegisteredMessage` nor
+  // `ShutdownMessage` are sent.
+  Clock::settle();
 }
 
 
@@ -2433,12 +2452,14 @@ TEST_F(MasterAuthorizationTest, UnauthorizedToReregisterAgent)
   Try<Owned<cluster::Master>> master = StartMaster(flags);
   ASSERT_SOME(master);
 
-  slave::Flags slaveFlags = CreateSlaveFlags();
-  StandaloneMasterDetector detector(master.get()->pid);
+  // Previously, agents were shut down when registration failed due to
+  // authorization. We verify that this no longer occurs.
+  EXPECT_NO_FUTURE_PROTOBUFS(ShutdownMessage(), _, _);
 
-  Future<Message> slaveRegisteredMessage =
-    FUTURE_MESSAGE(Eq(SlaveRegisteredMessage().GetTypeName()), _, _);
+  Future<SlaveRegisteredMessage> slaveRegisteredMessage =
+    FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);
 
+  StandaloneMasterDetector detector(master.get()->pid);
   Try<Owned<cluster::Slave>> slave = StartSlave(&detector);
   ASSERT_SOME(slave);
 
@@ -2451,15 +2472,23 @@ TEST_F(MasterAuthorizationTest, UnauthorizedToReregisterAgent)
   acl->mutable_agents()->set_type(ACL::Entity::NONE);
   flags.acls = acls;
 
-  Future<Message> shutdownMessage =
-    FUTURE_MESSAGE(Eq(ShutdownMessage().GetTypeName()), _, _);
+  // The agent should try but not be able to reregister.
+  Future<ReregisterSlaveMessage> reregisterSlaveMessage =
+    FUTURE_PROTOBUF(ReregisterSlaveMessage(), _, _);
+
+  EXPECT_NO_FUTURE_PROTOBUFS(SlaveReregisteredMessage(), _, _);
 
   master = StartMaster(flags);
   ASSERT_SOME(master);
 
   detector.appoint(master.get()->pid);
 
-  AWAIT_READY(shutdownMessage);
+  AWAIT_READY(reregisterSlaveMessage);
+
+  // Settle to make sure neither `SlaveReregisteredMessage` nor
+  // `ShutdownMessage` are sent.
+  Clock::pause();
+  Clock::settle();
 }
 
 
@@ -2512,10 +2541,12 @@ TEST_F(MasterAuthorizationTest, RetryRegisterAgent)
 }
 
 
-// This test verifies that the agent is shut down by the master if it is
-// not authorized to statically reserve any resources.
+// This test verifies that the agent is not allowed to register if it tries to
+// statically reserve resources, but it is not allowed to.
 TEST_F(MasterAuthorizationTest, UnauthorizedToStaticallyReserveResources)
 {
+  Clock::pause();
+
   // Set up ACLs so that the agent can (re)register.
   ACLs acls;
 
@@ -2538,8 +2569,15 @@ TEST_F(MasterAuthorizationTest, UnauthorizedToStaticallyReserveResources)
   Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
   ASSERT_SOME(master);
 
-  Future<Message> shutdownMessage =
-    FUTURE_MESSAGE(Eq(ShutdownMessage().GetTypeName()), _, _);
+  // Previously, agents were shut down when registration failed due to
+  // authorization. We verify that this no longer occurs.
+  EXPECT_NO_FUTURE_PROTOBUFS(ShutdownMessage(), _, _);
+
+  // We verify that the agent isn't allowed to register.
+  EXPECT_NO_FUTURE_PROTOBUFS(SlaveRegisteredMessage(), _, _);
+
+  Future<RegisterSlaveMessage> registerSlaveMessage =
+    FUTURE_PROTOBUF(RegisterSlaveMessage(), _, _);
 
   Owned<MasterDetector> detector = master.get()->createDetector();
 
@@ -2549,7 +2587,15 @@ TEST_F(MasterAuthorizationTest, UnauthorizedToStaticallyReserveResources)
   Try<Owned<cluster::Slave>> agent = StartSlave(detector.get(), agentFlags);
   ASSERT_SOME(agent);
 
-  AWAIT_READY(shutdownMessage);
+  // Advance the clock to trigger the registration attempt.
+  Clock::advance(agentFlags.registration_backoff_factor);
+  Clock::settle();
+
+  AWAIT_READY(registerSlaveMessage);
+
+  // Settle to make sure neither `SlaveRegisteredMessage` nor
+  // `ShutdownMessage` are sent.
+  Clock::settle();
 }
 
 
@@ -2557,6 +2603,8 @@ TEST_F(MasterAuthorizationTest, UnauthorizedToStaticallyReserveResources)
 // not authorized to statically reserve certain roles.
 TEST_F(MasterAuthorizationTest, UnauthorizedToStaticallyReserveRole)
 {
+  Clock::pause();
+
   // Set up ACLs so that the agent can (re)register.
   ACLs acls;
 
@@ -2587,8 +2635,15 @@ TEST_F(MasterAuthorizationTest, UnauthorizedToStaticallyReserveRole)
   Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
   ASSERT_SOME(master);
 
-  Future<Message> shutdownMessage =
-    FUTURE_MESSAGE(Eq(ShutdownMessage().GetTypeName()), _, _);
+  // Previously, agents were shut down when registration failed due to
+  // authorization. We verify that this no longer occurs.
+  EXPECT_NO_FUTURE_PROTOBUFS(ShutdownMessage(), _, _);
+
+  // We verify that the agent isn't allowed to register.
+  EXPECT_NO_FUTURE_PROTOBUFS(SlaveRegisteredMessage(), _, _);
+
+  Future<RegisterSlaveMessage> registerSlaveMessage =
+    FUTURE_PROTOBUF(RegisterSlaveMessage(), _, _);
 
   Owned<MasterDetector> detector = master.get()->createDetector();
 
@@ -2598,7 +2653,15 @@ TEST_F(MasterAuthorizationTest, UnauthorizedToStaticallyReserveRole)
   Try<Owned<cluster::Slave>> agent = StartSlave(detector.get(), agentFlags);
   ASSERT_SOME(agent);
 
-  AWAIT_READY(shutdownMessage);
+  // Advance the clock to trigger the registration attempt.
+  Clock::advance(agentFlags.registration_backoff_factor);
+  Clock::settle();
+
+  AWAIT_READY(registerSlaveMessage);
+
+  // Settle to make sure neither `SlaveRegisteredMessage` nor
+  // `ShutdownMessage` are sent.
+  Clock::settle();
 }
 
 


[2/2] mesos git commit: Improved logging for offers and inverse offers.

Posted by gr...@apache.org.
Improved logging for offers and inverse offers.

Log offer IDs and inverse offer IDs when sending out offers and
inverse offers so it is easier to match them to their ACCEPT or DECLINE
calls and removals.

Also log at `VLOG(2)` level which resources are offered.

NOTE: It is possible to enable `VLOG(2)` logs just for `master.cpp` by
setting the following env variable when starting the master:
`GLOG_vmodule=master=2`.

Review: https://reviews.apache.org/r/67817/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/fc5fcfd0
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/fc5fcfd0
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/fc5fcfd0

Branch: refs/heads/master
Commit: fc5fcfd05520b1459b883e088250bf527a6a564e
Parents: e4ea236
Author: Gastón Kleiman <ga...@mesosphere.io>
Authored: Tue Jul 10 08:06:32 2018 -0700
Committer: Greg Mann <gr...@gmail.com>
Committed: Tue Jul 10 08:20:24 2018 -0700

----------------------------------------------------------------------
 src/master/master.cpp | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/fc5fcfd0/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index e14f7de..487ee34 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -9417,6 +9417,9 @@ void Master::offer(
   // and a single allocation role.
   ResourceOffersMessage message;
 
+  // We keep track of the offer IDs so that we can log them.
+  vector<OfferID> offerIds;
+
   foreachkey (const string& role, resources) {
     foreachpair (const SlaveID& slaveId,
                  const Resources& offered,
@@ -9562,6 +9565,13 @@ void Master::offer(
       // Add the offer *AND* the corresponding slave's PID.
       message.add_offers()->MergeFrom(offer_);
       message.add_pids(slave->pid);
+
+      offerIds.push_back(offer_.id());
+
+      VLOG(2) << "Sending offer " << offer_.id()
+              << " containing resources " << offered
+              << " on agent " << *slave
+              << " to framework " << *framework;
     }
   }
 
@@ -9569,8 +9579,7 @@ void Master::offer(
     return;
   }
 
-  LOG(INFO) << "Sending " << message.offers().size()
-            << " offers to framework " << *framework;
+  LOG(INFO) << "Sending offers " << offerIds << " to framework " << *framework;
 
   framework->send(message);
 }
@@ -9672,8 +9681,13 @@ void Master::inverseOffer(
     return;
   }
 
-  LOG(INFO) << "Sending " << message.inverse_offers().size()
-            << " inverse offers to framework " << *framework;
+  vector<OfferID> inverseOfferIds;
+  foreach (const InverseOffer& inverseOffer, message.inverse_offers()) {
+    inverseOfferIds.push_back(inverseOffer.id());
+  }
+
+  LOG(INFO) << "Sending inverse offers " << inverseOfferIds << " to framework "
+            << *framework;
 
   framework->send(message);
 }