You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by me...@apache.org on 2016/08/02 08:24:29 UTC

mesos git commit: Added agent and scheduler authentication backoff.

Repository: mesos
Updated Branches:
  refs/heads/0.27.x 0b63bc928 -> b62d391fc


Added agent and scheduler authentication backoff.

The backoff follows the existing pattern for backoff used during agent
registration where we backoff for some random time in an interval of
increasing length capped by `AUTHENTICATION_RETRY_INTERVAL_MAX`.

Review: https://reviews.apache.org/r/49308/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/b62d391f
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/b62d391f
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/b62d391f

Branch: refs/heads/0.27.x
Commit: b62d391fc5eb1a39fd0859f871e910e50c7286b0
Parents: 0b63bc9
Author: Benjamin Bannier <be...@mesosphere.io>
Authored: Wed Jul 6 11:09:04 2016 -0700
Committer: Adam B <ad...@mesosphere.io>
Committed: Mon Aug 1 23:33:38 2016 -0700

----------------------------------------------------------------------
 docs/configuration.md   | 14 ++++++++++++++
 src/sched/constants.cpp |  4 ++++
 src/sched/constants.hpp |  8 ++++++++
 src/sched/flags.hpp     | 10 ++++++++++
 src/sched/sched.cpp     | 33 ++++++++++++++++++++++++++++-----
 src/slave/constants.cpp |  2 ++
 src/slave/constants.hpp |  7 +++++++
 src/slave/flags.cpp     | 11 +++++++++++
 src/slave/flags.hpp     |  1 +
 src/slave/slave.cpp     | 29 +++++++++++++++++++++++------
 src/slave/slave.hpp     |  3 +++
 11 files changed, 111 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/b62d391f/docs/configuration.md
----------------------------------------------------------------------
diff --git a/docs/configuration.md b/docs/configuration.md
index 29a6b65..0998523 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -819,6 +819,20 @@ file:///path/to/file (where file contains one of the above)</code></pre>
   </tr>
   <tr>
     <td>
+      --authentication_backoff_factor=VALUE
+    </td>
+    <td>
+      After a failed authentication the agent picks a random amount of time between
+      <code>[0, b]</code>, where <code>b = authentication_backoff_factor</code>, to
+      authenticate with a new master. Subsequent retries are exponentially backed
+      off based on this interval (e.g., 1st retry uses a random value between
+      <code>[0, b * 2^1]</code>, 2nd retry between <code>[0, b * 2^2]</code>, 3rd
+      retry between <code>[0, b * 2^3]</code>, etc up to a maximum of 1mins
+      (default: 1secs)
+    </td>
+  </tr>
+  <tr>
+    <td>
       --[no-]cgroups_cpu_enable_pids_and_tids_count
     </td>
     <td>

http://git-wip-us.apache.org/repos/asf/mesos/blob/b62d391f/src/sched/constants.cpp
----------------------------------------------------------------------
diff --git a/src/sched/constants.cpp b/src/sched/constants.cpp
index c00b8ba..c4fc80f 100644
--- a/src/sched/constants.cpp
+++ b/src/sched/constants.cpp
@@ -29,6 +29,10 @@ const Duration DEFAULT_REGISTRATION_BACKOFF_FACTOR = Seconds(2);
 
 const Duration REGISTRATION_RETRY_INTERVAL_MAX = Minutes(1);
 
+const Duration AUTHENTICATION_RETRY_INTERVAL_MAX = Minutes(1);
+
+const Duration DEFAULT_AUTHENTICATION_BACKOFF_FACTOR = Seconds(1);
+
 const std::string DEFAULT_AUTHENTICATEE = "crammd5";
 
 } // namespace scheduler {

http://git-wip-us.apache.org/repos/asf/mesos/blob/b62d391f/src/sched/constants.hpp
----------------------------------------------------------------------
diff --git a/src/sched/constants.hpp b/src/sched/constants.hpp
index 523c6d9..6038fd1 100644
--- a/src/sched/constants.hpp
+++ b/src/sched/constants.hpp
@@ -31,6 +31,14 @@ extern const Duration DEFAULT_REGISTRATION_BACKOFF_FACTOR;
 // registration.
 extern const Duration REGISTRATION_RETRY_INTERVAL_MAX;
 
+// The maximum interval the scheduler driver waits before retrying
+// authentication.
+extern const Duration AUTHENTICATION_RETRY_INTERVAL_MAX;
+
+// Default backoff interval used by the scheduler to wait after failed
+// authentication.
+extern const Duration DEFAULT_AUTHENTICATION_BACKOFF_FACTOR;
+
 // Name of the default, CRAM-MD5 authenticatee.
 extern const std::string DEFAULT_AUTHENTICATEE;
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/b62d391f/src/sched/flags.hpp
----------------------------------------------------------------------
diff --git a/src/sched/flags.hpp b/src/sched/flags.hpp
index 57dbab8..318dbc8 100644
--- a/src/sched/flags.hpp
+++ b/src/sched/flags.hpp
@@ -36,6 +36,15 @@ class Flags : public logging::Flags
 public:
   Flags()
   {
+    add(&Flags::authentication_backoff_factor,
+        "authentication_backoff_factor",
+        "Scheduler driver authentication retries are exponentially backed\n"
+        "off based on 'b', the authentication backoff factor (e.g., 1st retry\n"
+        "uses a random value between `[0, b * 2^1]`, 2nd retry between\n"
+        "`[0, b * 2^2]`, 3rd retry between `[0, b * 2^3]`, etc up to a\n"
+        "maximum of " + stringify(AUTHENTICATION_RETRY_INTERVAL_MAX),
+        DEFAULT_AUTHENTICATION_BACKOFF_FACTOR);
+
     add(&Flags::registration_backoff_factor,
         "registration_backoff_factor",
         "Scheduler driver (re-)registration retries are exponentially backed\n"
@@ -101,6 +110,7 @@ public:
         DEFAULT_AUTHENTICATEE);
   }
 
+  Duration authentication_backoff_factor;
   Duration registration_backoff_factor;
   Option<Modules> modules;
   std::string authenticatee;

http://git-wip-us.apache.org/repos/asf/mesos/blob/b62d391f/src/sched/sched.cpp
----------------------------------------------------------------------
diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp
index 5448d12..46b0691 100644
--- a/src/sched/sched.cpp
+++ b/src/sched/sched.cpp
@@ -23,12 +23,13 @@
 
 #include <arpa/inet.h>
 
+#include <cmath>
 #include <iostream>
 #include <map>
 #include <memory>
 #include <mutex>
-#include <string>
 #include <sstream>
+#include <string>
 
 #include <mesos/mesos.hpp>
 #include <mesos/module.hpp>
@@ -217,7 +218,8 @@ public:
       authenticatee(NULL),
       authenticating(None()),
       authenticated(false),
-      reauthenticate(false)
+      reauthenticate(false),
+      failedAuthentications(0)
   {
     LOG(INFO) << "Version: " << MESOS_VERSION;
   }
@@ -328,8 +330,8 @@ protected:
 
       if (credential.isSome()) {
         // Authenticate with the master.
-        // TODO(vinod): Do a backoff for authentication similar to what
-        // we do for registration.
+        // TODO(adam-mesos): Consider adding an initial delay like we do for
+        // slave registration, to combat thundering herds on master failover.
         authenticate();
       } else {
         // Proceed with registration without authentication.
@@ -456,8 +458,24 @@ protected:
       authenticating = None();
       reauthenticate = false;
 
+      ++failedAuthentications;
+
+      // Backoff.
+      // The backoff is a random duration in the interval [0, b * 2^N)
+      // where `b = authentication_backoff_factor` and `N` the number
+      // of failed authentication attempts. It is capped by
+      // `REGISTER_RETRY_INTERVAL_MAX`.
+      Duration backoff = flags.authentication_backoff_factor *
+                         std::pow(2, failedAuthentications);
+      backoff = std::min(backoff, scheduler::AUTHENTICATION_RETRY_INTERVAL_MAX);
+
+      // Determine the delay for next attempt by picking a random
+      // duration between 0 and 'maxBackoff'.
+      // TODO(vinod): Use random numbers from <random> header.
+      backoff *= (double) ::random() / RAND_MAX;
+
       // TODO(vinod): Add a limit on number of retries.
-      dispatch(self(), &Self::authenticate); // Retry.
+      delay(backoff, self(), &Self::authenticate);
       return;
     }
 
@@ -474,6 +492,8 @@ protected:
     authenticated = true;
     authenticating = None();
 
+    failedAuthentications = 0;
+
     doReliableRegistration(flags.registration_backoff_factor);
   }
 
@@ -1624,6 +1644,9 @@ private:
 
   // Indicates if a new authentication attempt should be enforced.
   bool reauthenticate;
+
+  // Indicates the number of failed authentication attempts.
+  uint64_t failedAuthentications;
 };
 
 } // namespace internal {

http://git-wip-us.apache.org/repos/asf/mesos/blob/b62d391f/src/slave/constants.cpp
----------------------------------------------------------------------
diff --git a/src/slave/constants.cpp b/src/slave/constants.cpp
index 0f0d8e4..8c4597e 100644
--- a/src/slave/constants.cpp
+++ b/src/slave/constants.cpp
@@ -51,6 +51,8 @@ const Duration DOCKER_INSPECT_DELAY = Seconds(1);
 // TODO(tnachen): Make this a flag.
 const Duration DOCKER_VERSION_WAIT_TIMEOUT = Seconds(5);
 const std::string DEFAULT_AUTHENTICATEE = "crammd5";
+const Duration AUTHENTICATION_RETRY_INTERVAL_MAX = Minutes(1);
+const Duration DEFAULT_AUTHENTICATION_BACKOFF_FACTOR = Seconds(1);
 const std::string COMMAND_EXECUTOR_ROOTFS_CONTAINER_PATH = ".rootfs";
 
 Duration DEFAULT_MASTER_PING_TIMEOUT()

http://git-wip-us.apache.org/repos/asf/mesos/blob/b62d391f/src/slave/constants.hpp
----------------------------------------------------------------------
diff --git a/src/slave/constants.hpp b/src/slave/constants.hpp
index bcbb140..ac8d8be 100644
--- a/src/slave/constants.hpp
+++ b/src/slave/constants.hpp
@@ -105,6 +105,13 @@ extern const Duration DOCKER_VERSION_WAIT_TIMEOUT;
 // Name of the default, CRAM-MD5 authenticatee.
 extern const std::string DEFAULT_AUTHENTICATEE;
 
+// The maximum interval the slave waits before retrying authentication.
+extern const Duration AUTHENTICATION_RETRY_INTERVAL_MAX;
+
+// Default backoff interval used by the slave to wait after failed
+// authentication.
+extern const Duration DEFAULT_AUTHENTICATION_BACKOFF_FACTOR;
+
 // Default maximum storage space to be used by the fetcher cache.
 const Bytes DEFAULT_FETCHER_CACHE_SIZE = Gigabytes(2);
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/b62d391f/src/slave/flags.cpp
----------------------------------------------------------------------
diff --git a/src/slave/flags.cpp b/src/slave/flags.cpp
index 2e88d74..70ce77c 100644
--- a/src/slave/flags.cpp
+++ b/src/slave/flags.cpp
@@ -223,6 +223,17 @@ mesos::internal::slave::Flags::Flags()
         stringify(REGISTER_RETRY_INTERVAL_MAX),
       DEFAULT_REGISTRATION_BACKOFF_FACTOR);
 
+  add(&Flags::authentication_backoff_factor,
+      "authentication_backoff_factor",
+      "After a failed authentication the agent picks a random amount of time\n"
+      "between `[0, b]`, where `b = authentication_backoff_factor`, to\n"
+      "authenticate with a new master. Subsequent retries are exponentially\n"
+      "backed off based on this interval (e.g., 1st retry uses a random\n"
+      "value between `[0, b * 2^1]`, 2nd retry between `[0, b * 2^2]`, 3rd\n"
+      "retry between `[0, b * 2^3]`, etc up to a maximum of " +
+          stringify(AUTHENTICATION_RETRY_INTERVAL_MAX),
+      DEFAULT_AUTHENTICATION_BACKOFF_FACTOR);
+
   add(&Flags::executor_environment_variables,
       "executor_environment_variables",
       "JSON object representing the environment\n"

http://git-wip-us.apache.org/repos/asf/mesos/blob/b62d391f/src/slave/flags.hpp
----------------------------------------------------------------------
diff --git a/src/slave/flags.hpp b/src/slave/flags.hpp
index 1293397..13d5fb0 100644
--- a/src/slave/flags.hpp
+++ b/src/slave/flags.hpp
@@ -68,6 +68,7 @@ public:
 #endif // __WINDOWS__
   std::string frameworks_home;  // TODO(benh): Make an Option.
   Duration registration_backoff_factor;
+  Duration authentication_backoff_factor;
   Option<JSON::Object> executor_environment_variables;
   Duration executor_registration_timeout;
   Duration executor_shutdown_grace_period;

http://git-wip-us.apache.org/repos/asf/mesos/blob/b62d391f/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 6d9b4d0..57f03bf 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -19,6 +19,7 @@
 #include <stdlib.h> // For random().
 
 #include <algorithm>
+#include <cmath>
 #include <iomanip>
 #include <list>
 #include <map>
@@ -143,6 +144,7 @@ Slave::Slave(const slave::Flags& _flags,
     authenticating(None()),
     authenticated(false),
     reauthenticate(false),
+    failedAuthentications(0),
     executorDirectoryMaxAllowedAge(age(0)),
     resourceEstimator(_resourceEstimator),
     qosController(_qosController) {}
@@ -806,11 +808,8 @@ void Slave::detected(const Future<Option<MasterInfo>>& _master)
 
     if (credential.isSome()) {
       // Authenticate with the master.
-      // TODO(vinod): Do a backoff for authentication similar to what
-      // we do for registration. This is a little tricky because, if
-      // we delay 'Slave::authenticate' and a new master is detected
-      // before 'authenticate' event is processed the slave tries to
-      // authenticate with the new master twice.
+      // TODO(adam-mesos): Consider adding an initial delay like we do
+      // for registration, to combat thundering herds on master failover.
       // TODO(vinod): Consider adding an "AUTHENTICATED" state to the
       // slave instead of "authenticate" variable.
       authenticate();
@@ -915,8 +914,24 @@ void Slave::_authenticate()
     authenticating = None();
     reauthenticate = false;
 
+    ++failedAuthentications;
+
+    // Backoff.
+    // The backoff is a random duration in the interval [0, b * 2^N)
+    // where `b = authentication_backoff_factor` and `N` the number
+    // of failed authentication attempts. It is capped by
+    // `REGISTER_RETRY_INTERVAL_MAX`.
+    Duration backoff =
+      flags.authentication_backoff_factor * std::pow(2, failedAuthentications);
+    backoff = std::min(backoff, AUTHENTICATION_RETRY_INTERVAL_MAX);
+
+    // Determine the delay for next attempt by picking a random
+    // duration between 0 and 'maxBackoff'.
+    // TODO(vinod): Use random numbers from <random> header.
+    backoff *= (double) ::random() / RAND_MAX;
+
     // TODO(vinod): Add a limit on number of retries.
-    dispatch(self(), &Self::authenticate); // Retry.
+    delay(backoff, self(), &Self::authenticate); // Retry.
     return;
   }
 
@@ -931,6 +946,8 @@ void Slave::_authenticate()
   authenticated = true;
   authenticating = None();
 
+  failedAuthentications = 0;
+
   // Proceed with registration.
   doReliableRegistration(flags.registration_backoff_factor * 2);
 }

http://git-wip-us.apache.org/repos/asf/mesos/blob/b62d391f/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index b7586ce..90f80a5 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -553,6 +553,9 @@ private:
   // Indicates if a new authentication attempt should be enforced.
   bool reauthenticate;
 
+  // Indicates the number of failed authentication attempts.
+  uint64_t failedAuthentications;
+
   // Maximum age of executor directories. Will be recomputed
   // periodically every flags.disk_watch_interval.
   Duration executorDirectoryMaxAllowedAge;