You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2014/11/07 03:19:32 UTC

mesos git commit: Updated scheduler driver to back off exponentially during registration retries.

Repository: mesos
Updated Branches:
  refs/heads/master 12d61403a -> d9f85d7f7


Updated scheduler driver to back off exponentially during registration
retries.

Review: https://reviews.apache.org/r/27315


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/d9f85d7f
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/d9f85d7f
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/d9f85d7f

Branch: refs/heads/master
Commit: d9f85d7f7209cf24a4893156253909343ff12504
Parents: 12d6140
Author: Vinod Kone <vi...@gmail.com>
Authored: Thu Nov 6 18:19:11 2014 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Thu Nov 6 18:19:13 2014 -0800

----------------------------------------------------------------------
 src/Makefile.am                     |  3 ++
 src/sched/constants.cpp             | 36 ++++++++++++++++++++
 src/sched/constants.hpp             | 40 ++++++++++++++++++++++
 src/sched/flags.hpp                 | 55 ++++++++++++++++++++++++++++++
 src/sched/sched.cpp                 | 58 +++++++++++++++++++++++++++++---
 src/slave/slave.cpp                 | 18 +++++-----
 src/slave/slave.hpp                 |  2 +-
 src/tests/fault_tolerance_tests.cpp |  7 ++--
 8 files changed, 200 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/Makefile.am
----------------------------------------------------------------------
diff --git a/src/Makefile.am b/src/Makefile.am
index 9ab3b9c..443554a 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -279,6 +279,7 @@ libmesos_no_3rdparty_la_SOURCES =					\
 	master/registrar.cpp						\
 	master/repairer.cpp						\
 	module/manager.cpp						\
+	sched/constants.cpp						\
 	sched/sched.cpp							\
 	scheduler/scheduler.cpp						\
 	slave/constants.cpp						\
@@ -431,6 +432,8 @@ libmesos_no_3rdparty_la_SOURCES +=					\
 	module/authenticator.hpp					\
 	module/isolator.hpp						\
 	module/manager.hpp						\
+	sched/constants.hpp						\
+	sched/flags.hpp							\
 	slave/constants.hpp						\
 	slave/flags.hpp							\
 	slave/gc.hpp							\

http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/sched/constants.cpp
----------------------------------------------------------------------
diff --git a/src/sched/constants.cpp b/src/sched/constants.cpp
new file mode 100644
index 0000000..44ccfbe
--- /dev/null
+++ b/src/sched/constants.cpp
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sched/constants.hpp"
+
+namespace mesos {
+namespace internal {
+namespace scheduler {
+
+// NOTE: The default backoff factor for the scheduler (2s) differs
+// from the slave's (1s) because, unlike the slave, the scheduler
+// driver does not back off before its very first attempt.
+// TODO(vinod): Once we fix the scheduler driver to do initial backoff
+// we can change the default to 1s.
+const Duration REGISTRATION_BACKOFF_FACTOR = Seconds(2);
+
+const Duration REGISTRATION_RETRY_INTERVAL_MAX = Minutes(1);
+
+} // namespace scheduler {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/sched/constants.hpp
----------------------------------------------------------------------
diff --git a/src/sched/constants.hpp b/src/sched/constants.hpp
new file mode 100644
index 0000000..63707a8
--- /dev/null
+++ b/src/sched/constants.hpp
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SCHED_CONSTANTS_HPP__
+#define __SCHED_CONSTANTS_HPP__
+
+#include <stout/duration.hpp>
+
+namespace mesos {
+namespace internal {
+namespace scheduler {
+
+// Default backoff factor used by the scheduler driver to compute how
+// long to wait before retrying registration.
+extern const Duration REGISTRATION_BACKOFF_FACTOR;
+
+// The maximum interval the scheduler driver waits before retrying
+// registration.
+extern const Duration REGISTRATION_RETRY_INTERVAL_MAX;
+
+} // namespace scheduler {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __SCHED_CONSTANTS_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/sched/flags.hpp
----------------------------------------------------------------------
diff --git a/src/sched/flags.hpp b/src/sched/flags.hpp
new file mode 100644
index 0000000..62a634b
--- /dev/null
+++ b/src/sched/flags.hpp
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SCHED_FLAGS_HPP__
+#define __SCHED_FLAGS_HPP__
+
+#include <stout/flags.hpp>
+
+#include "logging/flags.hpp"
+
+#include "sched/constants.hpp"
+
+namespace mesos {
+namespace internal {
+namespace scheduler {
+
+class Flags : public logging::Flags
+{
+public:
+  Flags()
+  {
+    add(&Flags::registration_backoff_factor,
+        "registration_backoff_factor",
+        "Scheduler driver (re-)registration retries are exponentially backed\n"
+        "off based on 'b', the registration backoff factor (e.g., 1st retry\n"
+        "uses a random value between [0, b], 2nd retry between [0, b * 2^1],\n"
+        "3rd retry between [0, b * 2^2]...) up to a maximum of (framework\n"
+        "failover timeout/10, if failover timeout is specified) or " +
+        stringify(REGISTRATION_RETRY_INTERVAL_MAX) + ", whichever is smaller",
+        REGISTRATION_BACKOFF_FACTOR); // Presumably the flag's default value.
+  }
+
+  Duration registration_backoff_factor; // The 'b' in the help text above.
+};
+
+} // namespace scheduler {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __SCHED_FLAGS_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/sched/sched.cpp
----------------------------------------------------------------------
diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp
index e5f828d..8ca0526 100644
--- a/src/sched/sched.cpp
+++ b/src/sched/sched.cpp
@@ -64,6 +64,7 @@
 #include "common/lock.hpp"
 #include "common/type_utils.hpp"
 
+#include "local/flags.hpp"
 #include "local/local.hpp"
 
 #include "logging/flags.hpp"
@@ -73,6 +74,9 @@
 
 #include "messages/messages.hpp"
 
+#include "sched/constants.hpp"
+#include "sched/flags.hpp"
+
 using namespace mesos;
 using namespace mesos::internal;
 using namespace mesos::internal::master;
@@ -105,6 +109,7 @@ public:
                    const Option<Credential>& _credential,
                    const string& schedulerId,
                    MasterDetector* _detector,
+                   const internal::scheduler::Flags& _flags,
                    pthread_mutex_t* _mutex,
                    pthread_cond_t* _cond)
       // We use a UUID here to ensure that the master can reliably
@@ -128,6 +133,7 @@ public:
       connected(false),
       aborted(false),
       detector(_detector),
+      flags(_flags),
       credential(_credential),
       authenticatee(NULL),
       authenticating(None()),
@@ -235,13 +241,19 @@ protected:
 
       if (credential.isSome()) {
         // Authenticate with the master.
+        // TODO(vinod): Do a backoff for authentication similar to what
+        // we do for registration.
         authenticate();
       } else {
         // Proceed with registration without authentication.
         LOG(INFO) << "No credentials provided."
                   << " Attempting to register without authentication";
 
-        doReliableRegistration();
+        // TODO(vinod): Similar to the slave, add a random delay to the
+        // first registration attempt too. This requires fixing tests
+        // that expect the scheduler to register even with the clock
+        // paused (e.g., rate limiting tests).
+        doReliableRegistration(flags.registration_backoff_factor);
       }
     } else {
       // In this case, we don't actually invoke Scheduler::error
@@ -359,7 +371,7 @@ protected:
     authenticated = true;
     authenticating = None();
 
-    doReliableRegistration(); // Proceed with registration.
+    doReliableRegistration(flags.registration_backoff_factor);
   }
 
   void authenticationTimeout(Future<bool> future)
@@ -463,7 +475,7 @@ protected:
     VLOG(1) << "Scheduler::reregistered took " << stopwatch.elapsed();
   }
 
-  void doReliableRegistration()
+  void doReliableRegistration(Duration maxBackoff)
   {
     if (connected || master.isNone()) {
       return;
@@ -488,7 +500,29 @@ protected:
       send(master.get(), message);
     }
 
-    delay(Seconds(1), self(), &Self::doReliableRegistration);
+    // Bound the maximum backoff by 'REGISTRATION_RETRY_INTERVAL_MAX'.
+    maxBackoff =
+      std::min(maxBackoff, scheduler::REGISTRATION_RETRY_INTERVAL_MAX);
+
+    // If failover timeout is present, bound the maximum backoff
+    // by 1/10th of the failover timeout.
+    if (framework.has_failover_timeout()) {
+      Try<Duration> duration = Duration::create(framework.failover_timeout());
+      if (duration.isSome()) {
+        maxBackoff = std::min(maxBackoff, duration.get() / 10);
+      }
+    }
+
+    // Determine the delay for next attempt by picking a random
+    // duration between 0 and 'maxBackoff'.
+    // TODO(vinod): Use random numbers from <random> header.
+    Duration delay = maxBackoff * ((double) ::random() / RAND_MAX);
+
+    VLOG(1) << "Will retry registration in " << delay << " if necessary";
+
+    // Backoff.
+    process::delay(
+        delay, self(), &Self::doReliableRegistration, maxBackoff * 2);
   }
 
   void resourceOffers(
@@ -1021,6 +1055,8 @@ private:
 
   MasterDetector* detector;
 
+  const internal::scheduler::Flags flags;
+
   hashmap<OfferID, hashmap<SlaveID, UPID> > savedOffers;
   hashmap<SlaveID, UPID> savedSlavePids;
 
@@ -1051,7 +1087,6 @@ void MesosSchedulerDriver::initialize() {
   // we'll probably want a way to load master::Flags and slave::Flags
   // as well.
   local::Flags flags;
-
   Try<Nothing> load = flags.load("MESOS_");
 
   if (load.isError()) {
@@ -1116,6 +1151,7 @@ void MesosSchedulerDriver::initialize() {
   url = pid.isSome() ? static_cast<string>(pid.get()) : master;
 }
 
+
 // Implementation of C++ API.
 //
 // Notes:
@@ -1230,6 +1266,16 @@ Status MesosSchedulerDriver::start()
     detector = detector_.get();
   }
 
+  // Load scheduler flags.
+  internal::scheduler::Flags flags;
+  Try<Nothing> load = flags.load("MESOS_");
+
+  if (load.isError()) {
+    status = DRIVER_ABORTED;
+    scheduler->error(this, load.error());
+    return status;
+  }
+
   CHECK(process == NULL);
 
   if (credential == NULL) {
@@ -1240,6 +1286,7 @@ Status MesosSchedulerDriver::start()
         None(),
         schedulerId,
         detector,
+        flags,
         &mutex,
         &cond);
   } else {
@@ -1251,6 +1298,7 @@ Status MesosSchedulerDriver::start()
         cred,
         schedulerId,
         detector,
+        flags,
         &mutex,
         &cond);
   }

http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index dbfd1a8..81e0c4b 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -906,7 +906,7 @@ void Slave::reregistered(
 }
 
 
-void Slave::doReliableRegistration(const Duration& duration)
+void Slave::doReliableRegistration(Duration maxBackoff)
 {
   if (master.isNone()) {
     LOG(INFO) << "Skipping registration because no master present";
@@ -1040,19 +1040,17 @@ void Slave::doReliableRegistration(const Duration& duration)
     send(master.get(), message);
   }
 
-  // Retry registration if necessary.
-  Duration next = std::min(
-      duration * ((double) ::random() / RAND_MAX),
-      REGISTER_RETRY_INTERVAL_MAX);
+  // Bound the maximum backoff by 'REGISTER_RETRY_INTERVAL_MAX'.
+  maxBackoff = std::min(maxBackoff, REGISTER_RETRY_INTERVAL_MAX);
 
-  Duration duration_ = std::min(
-      duration * 2,
-      REGISTER_RETRY_INTERVAL_MAX);
+  // Determine the delay for next attempt by picking a random
+  // duration between 0 and 'maxBackoff'.
+  Duration delay = maxBackoff * ((double) ::random() / RAND_MAX);
 
-  VLOG(1) << "Will retry registration in " << next << " if necessary";
+  VLOG(1) << "Will retry registration in " << delay << " if necessary";
 
   // Backoff.
-  delay(next, self(), &Slave::doReliableRegistration, duration_);
+  process::delay(delay, self(), &Slave::doReliableRegistration, maxBackoff * 2);
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index 5b082fc..72bbec9 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -104,7 +104,7 @@ public:
       const SlaveID& slaveId,
       const std::vector<ReconcileTasksMessage>& reconciliations);
 
-  void doReliableRegistration(const Duration& duration);
+  void doReliableRegistration(Duration maxBackoff);
 
   // Made 'virtual' for Slave mocking.
   virtual void runTask(

http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/tests/fault_tolerance_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/fault_tolerance_tests.cpp b/src/tests/fault_tolerance_tests.cpp
index 372c4fd..5baeda6 100644
--- a/src/tests/fault_tolerance_tests.cpp
+++ b/src/tests/fault_tolerance_tests.cpp
@@ -40,6 +40,8 @@
 #include "master/allocator.hpp"
 #include "master/master.hpp"
 
+#include "sched/constants.hpp"
+
 #include "slave/constants.hpp"
 #include "slave/slave.hpp"
 
@@ -803,7 +805,7 @@ TEST_F(FaultToleranceTest, SchedulerFailoverRetriedReregistration)
   AWAIT_READY(reregistrationMessage);
 
   // Trigger the re-registration retry.
-  Clock::advance(Seconds(1));
+  Clock::advance(internal::scheduler::REGISTRATION_BACKOFF_FACTOR);
 
   AWAIT_READY(sched2Registered);
 
@@ -856,9 +858,8 @@ TEST_F(FaultToleranceTest, FrameworkReliableRegistration)
 
   AWAIT_READY(frameworkRegisteredMessage);
 
-  // TODO(benh): Pull out constant from SchedulerProcess.
   Clock::pause();
-  Clock::advance(Seconds(1));
+  Clock::advance(internal::scheduler::REGISTRATION_BACKOFF_FACTOR);
 
   AWAIT_READY(registered); // Ensures registered message is received.