You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2018/08/16 21:37:16 UTC

[mesos] 01/04: Fixed a backoff overflow bug in scheduler authentication retry logic.

This is an automated email from the ASF dual-hosted git repository.

bmahler pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit f10b0af1cc50b484f9963db359b3be3721a4cb67
Author: Meng Zhu <mz...@mesosphere.io>
AuthorDate: Thu Aug 16 14:34:47 2018 -0700

    Fixed a backoff overflow bug in scheduler authentication retry logic.
    
    This patch fixed the backoff time calculation
    overflow bug described in MESOS-9147.
    
    The old approach times out an authentication request after
    `flags.authentication_timeout` and then retries after some
    backoff time. This is not optimal because, if the scheduler
    is going to backoff some time before retry, we might as well
    wait that long for the previous authentication request
    (instead of timeout early).
    
    This patch combines the authentication timeout and
    authentication retry backoff interval into a single
    wait time interval. Now scheduler will timeout the previous
    authentication request after the wait time interval and
    then immediately retry.
    
    Review: https://reviews.apache.org/r/68346/
---
 src/sched/constants.hpp |  8 +++--
 src/sched/flags.hpp     | 20 +++++++------
 src/sched/sched.cpp     | 77 +++++++++++++++++++++++--------------------------
 3 files changed, 52 insertions(+), 53 deletions(-)

diff --git a/src/sched/constants.hpp b/src/sched/constants.hpp
index 9edb25b..3233ef4 100644
--- a/src/sched/constants.hpp
+++ b/src/sched/constants.hpp
@@ -38,9 +38,11 @@ constexpr Duration DEFAULT_REGISTRATION_BACKOFF_FACTOR = Seconds(2);
 // registration.
 constexpr Duration REGISTRATION_RETRY_INTERVAL_MAX = Minutes(1);
 
-// The maximum interval the scheduler driver waits before retrying
-// authentication.
-constexpr Duration AUTHENTICATION_RETRY_INTERVAL_MAX = Minutes(1);
+// The maximum timeout used when the scheduler driver is authenticating with
+// the master.
+//
+// TODO(mzhu): Make this configurable.
+constexpr Duration AUTHENTICATION_TIMEOUT_MAX = Minutes(1);
 
 // Default backoff interval used by the scheduler to wait after failed
 // authentication.
diff --git a/src/sched/flags.hpp b/src/sched/flags.hpp
index 2492665..c799386 100644
--- a/src/sched/flags.hpp
+++ b/src/sched/flags.hpp
@@ -36,15 +36,6 @@ class Flags : public virtual logging::Flags
 public:
   Flags()
   {
-    add(&Flags::authentication_backoff_factor,
-        "authentication_backoff_factor",
-        "Scheduler driver authentication retries are exponentially backed\n"
-        "off based on 'b', the authentication backoff factor (e.g., 1st retry\n"
-        "uses a random value between `[0, b * 2^1]`, 2nd retry between\n"
-        "`[0, b * 2^2]`, 3rd retry between `[0, b * 2^3]`, etc up to a\n"
-        "maximum of " + stringify(AUTHENTICATION_RETRY_INTERVAL_MAX),
-        DEFAULT_AUTHENTICATION_BACKOFF_FACTOR);
-
     add(&Flags::registration_backoff_factor,
         "registration_backoff_factor",
         "Scheduler driver (re-)registration retries are exponentially backed\n"
@@ -119,6 +110,17 @@ public:
         "or load an alternate authenticatee module using MESOS_MODULES.",
         DEFAULT_AUTHENTICATEE);
 
+    add(&Flags::authentication_backoff_factor,
+        "authentication_backoff_factor",
+        "The scheduler will time out its authentication with the master based\n"
+        "on exponential backoff. The timeout will be randomly chosen within\n"
+        "`[authentication_timeout, authentication_timeout + factor*2^n]`\n"
+        "where `n` is the number of failed attempts. The maximum timeout\n"
+        "internal is capped at " + stringify(AUTHENTICATION_TIMEOUT_MAX) + ".\n"
+        "To tune these parameters, set the `--authentication_timeout` and\n"
+        "`--authentication_backoff_factor` flags.\n",
+        DEFAULT_AUTHENTICATION_BACKOFF_FACTOR);
+
     add(&Flags::authentication_timeout,
         "authentication_timeout",
         "Timeout after which authentication will be retried.",
diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp
index 4de7622..c5b0595 100644
--- a/src/sched/sched.cpp
+++ b/src/sched/sched.cpp
@@ -345,7 +345,12 @@ protected:
         // Authenticate with the master.
         // TODO(adam-mesos): Consider adding an initial delay like we do for
         // slave registration, to combat thundering herds on master failover.
-        authenticate();
+        authenticate(
+            flags.authentication_timeout,
+            std::min(
+                flags.authentication_timeout +
+                  flags.authentication_backoff_factor * 2,
+                scheduler::AUTHENTICATION_TIMEOUT_MAX));
       } else {
         // Proceed with registration without authentication.
         LOG(INFO) << "No credentials provided."
@@ -368,7 +373,7 @@ protected:
       .onAny(defer(self(), &SchedulerProcess::detected, lambda::_1));
   }
 
-  void authenticate()
+  void authenticate(Duration minTimeout, Duration maxTimeout)
   {
     if (!running.load()) {
       VLOG(1) << "Ignoring authenticate because the driver is not running!";
@@ -414,6 +419,10 @@ protected:
       authenticatee = module.get();
     }
 
+    // We pick a random duration between `minTimeout` and `maxTimeout`.
+    Duration timeout = minTimeout + (maxTimeout - minTimeout) *
+                                      ((double)os::random() / RAND_MAX);
+
     // NOTE: We do not pass 'Owned<Authenticatee>' here because doing
     // so could make 'AuthenticateeProcess' responsible for deleting
     // 'Authenticatee' causing a deadlock because the destructor of
@@ -429,15 +438,19 @@ protected:
     // TODO(vinod): Consider using 'Shared' to 'Owned' upgrade.
     authenticating =
       authenticatee->authenticate(master->pid(), self(), credential.get())
-        .onAny(defer(self(), &Self::_authenticate));
+        .onAny(defer(self(), &Self::_authenticate, minTimeout, maxTimeout))
+        .after(timeout, [](Future<bool> future) {
+          // NOTE: Discarded future results in a retry in '_authenticate()'.
+          // This is a no-op if the future is already ready.
+          if (future.discard()) {
+            LOG(WARNING) << "Authentication timed out";
+          }
 
-    delay(flags.authentication_timeout,
-          self(),
-          &Self::authenticationTimeout,
-          authenticating.get());
+          return future;
+        });
   }
 
-  void _authenticate()
+  void _authenticate(Duration currentMinTimeout, Duration currentMaxTimeout)
   {
     if (!running.load()) {
       VLOG(1) << "Ignoring _authenticate because the driver is not running!";
@@ -471,24 +484,23 @@ protected:
       authenticating = None();
       reauthenticate = false;
 
-      ++failedAuthentications;
-
-      // Backoff.
-      // The backoff is a random duration in the interval [0, b * 2^N)
-      // where `b = authentication_backoff_factor` and `N` the number
-      // of failed authentication attempts. It is capped by
-      // `REGISTER_RETRY_INTERVAL_MAX`.
-      Duration backoff = flags.authentication_backoff_factor *
-                         std::pow(2, failedAuthentications);
-      backoff = std::min(backoff, scheduler::AUTHENTICATION_RETRY_INTERVAL_MAX);
+      // TODO(vinod): Add a limit on number of retries.
 
-      // Determine the delay for next attempt by picking a random
-      // duration between 0 and 'maxBackoff'.
-      // TODO(vinod): Use random numbers from <random> header.
-      backoff *= double(os::random()) / RAND_MAX;
+      // Grow the timeout range using exponential backoff:
+      //
+      //   [min, min + factor * 2^0]
+      //   [min, min + factor * 2^1]
+      //   ...
+      //   [min, min + factor * 2^N]
+      //   ...
+      //   [min, max] // Stop at max.
+      Duration maxTimeout =
+        currentMinTimeout + (currentMaxTimeout - currentMinTimeout) * 2;
+
+      authenticate(
+          currentMinTimeout,
+          std::min(maxTimeout, scheduler::AUTHENTICATION_TIMEOUT_MAX));
 
-      // TODO(vinod): Add a limit on number of retries.
-      delay(backoff, self(), &Self::authenticate);
       return;
     }
 
@@ -508,23 +520,6 @@ protected:
     doReliableRegistration(flags.registration_backoff_factor);
   }
 
-  void authenticationTimeout(Future<bool> future)
-  {
-    if (!running.load()) {
-      VLOG(1) << "Ignoring authentication timeout because "
-              << "the driver is not running!";
-      return;
-    }
-
-    // NOTE: Discarded future results in a retry in '_authenticate()'.
-    // Also note that a 'discard' here is safe even if another
-    // authenticator is in progress because this copy of the future
-    // corresponds to the original authenticator that started the timer.
-    if (future.discard()) { // This is a no-op if the future is already ready.
-      LOG(WARNING) << "Authentication timed out";
-    }
-  }
-
   void drop(const Event& event, const string& message)
   {
     // TODO(bmahler): Increment a metric.