You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2014/11/07 03:19:32 UTC
mesos git commit: Updated scheduler driver to exponentially backoff
during registration retries.
Repository: mesos
Updated Branches:
refs/heads/master 12d61403a -> d9f85d7f7
Updated scheduler driver to exponentially backoff during registration
retries.
Review: https://reviews.apache.org/r/27315
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/d9f85d7f
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/d9f85d7f
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/d9f85d7f
Branch: refs/heads/master
Commit: d9f85d7f7209cf24a4893156253909343ff12504
Parents: 12d6140
Author: Vinod Kone <vi...@gmail.com>
Authored: Thu Nov 6 18:19:11 2014 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Thu Nov 6 18:19:13 2014 -0800
----------------------------------------------------------------------
src/Makefile.am | 3 ++
src/sched/constants.cpp | 36 ++++++++++++++++++++
src/sched/constants.hpp | 40 ++++++++++++++++++++++
src/sched/flags.hpp | 55 ++++++++++++++++++++++++++++++
src/sched/sched.cpp | 58 +++++++++++++++++++++++++++++---
src/slave/slave.cpp | 18 +++++-----
src/slave/slave.hpp | 2 +-
src/tests/fault_tolerance_tests.cpp | 7 ++--
8 files changed, 200 insertions(+), 19 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/Makefile.am
----------------------------------------------------------------------
diff --git a/src/Makefile.am b/src/Makefile.am
index 9ab3b9c..443554a 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -279,6 +279,7 @@ libmesos_no_3rdparty_la_SOURCES = \
master/registrar.cpp \
master/repairer.cpp \
module/manager.cpp \
+ sched/constants.cpp \
sched/sched.cpp \
scheduler/scheduler.cpp \
slave/constants.cpp \
@@ -431,6 +432,8 @@ libmesos_no_3rdparty_la_SOURCES += \
module/authenticator.hpp \
module/isolator.hpp \
module/manager.hpp \
+ sched/constants.hpp \
+ sched/flags.hpp \
slave/constants.hpp \
slave/flags.hpp \
slave/gc.hpp \
http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/sched/constants.cpp
----------------------------------------------------------------------
diff --git a/src/sched/constants.cpp b/src/sched/constants.cpp
new file mode 100644
index 0000000..44ccfbe
--- /dev/null
+++ b/src/sched/constants.cpp
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sched/constants.hpp"
+
+namespace mesos {
+namespace internal {
+namespace scheduler {
+
+// NOTE: The default backoff factor for the scheduler (2s) differs
+// from the slave's (1s) because, unlike the slave, the scheduler
+// driver does not back off before its very first attempt.
+// TODO(vinod): Once the scheduler driver also does an initial
+// backoff, we can change the default to 1s.
+const Duration REGISTRATION_BACKOFF_FACTOR = Seconds(2);
+
+const Duration REGISTRATION_RETRY_INTERVAL_MAX = Minutes(1);
+
+} // namespace scheduler {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/sched/constants.hpp
----------------------------------------------------------------------
diff --git a/src/sched/constants.hpp b/src/sched/constants.hpp
new file mode 100644
index 0000000..63707a8
--- /dev/null
+++ b/src/sched/constants.hpp
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SCHED_CONSTANTS_HPP__
+#define __SCHED_CONSTANTS_HPP__
+
+#include <stout/duration.hpp>
+
+namespace mesos {
+namespace internal {
+namespace scheduler {
+
+// Default initial upper bound on the random delay the scheduler
+// driver waits before retrying registration; doubled on each retry.
+extern const Duration REGISTRATION_BACKOFF_FACTOR;
+
+// The maximum interval the scheduler driver waits before retrying
+// registration.
+extern const Duration REGISTRATION_RETRY_INTERVAL_MAX;
+
+} // namespace scheduler {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __SCHED_CONSTANTS_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/sched/flags.hpp
----------------------------------------------------------------------
diff --git a/src/sched/flags.hpp b/src/sched/flags.hpp
new file mode 100644
index 0000000..62a634b
--- /dev/null
+++ b/src/sched/flags.hpp
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SCHED_FLAGS_HPP__
+#define __SCHED_FLAGS_HPP__
+
+#include <stout/flags.hpp>
+
+#include "logging/flags.hpp"
+
+#include "sched/constants.hpp"
+
+namespace mesos {
+namespace internal {
+namespace scheduler {
+
+class Flags : public logging::Flags
+{
+public:
+ Flags()
+ {
+ add(&Flags::registration_backoff_factor,
+ "registration_backoff_factor",
+ "Scheduler driver (re-)registration retries are exponentially backed\n"
+ "off based on 'b', the registration backoff factor (e.g., 1st retry\n"
+ "uses a random value between [0, b], 2nd retry between [0, b * 2^1],\n"
+ "3rd retry between [0, b * 2^2]...) up to a maximum of (framework\n"
+ "failover timeout/10, if failover timeout is specified) or " +
+ stringify(REGISTRATION_RETRY_INTERVAL_MAX) + ", whichever is smaller",
+ REGISTRATION_BACKOFF_FACTOR);
+ }
+
+ Duration registration_backoff_factor;
+};
+
+} // namespace scheduler {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __SCHED_FLAGS_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/sched/sched.cpp
----------------------------------------------------------------------
diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp
index e5f828d..8ca0526 100644
--- a/src/sched/sched.cpp
+++ b/src/sched/sched.cpp
@@ -64,6 +64,7 @@
#include "common/lock.hpp"
#include "common/type_utils.hpp"
+#include "local/flags.hpp"
#include "local/local.hpp"
#include "logging/flags.hpp"
@@ -73,6 +74,9 @@
#include "messages/messages.hpp"
+#include "sched/constants.hpp"
+#include "sched/flags.hpp"
+
using namespace mesos;
using namespace mesos::internal;
using namespace mesos::internal::master;
@@ -105,6 +109,7 @@ public:
const Option<Credential>& _credential,
const string& schedulerId,
MasterDetector* _detector,
+ const internal::scheduler::Flags& _flags,
pthread_mutex_t* _mutex,
pthread_cond_t* _cond)
// We use a UUID here to ensure that the master can reliably
@@ -128,6 +133,7 @@ public:
connected(false),
aborted(false),
detector(_detector),
+ flags(_flags),
credential(_credential),
authenticatee(NULL),
authenticating(None()),
@@ -235,13 +241,19 @@ protected:
if (credential.isSome()) {
// Authenticate with the master.
+ // TODO(vinod): Do a backoff for authentication similar to what
+ // we do for registration.
authenticate();
} else {
// Proceed with registration without authentication.
LOG(INFO) << "No credentials provided."
<< " Attempting to register without authentication";
- doReliableRegistration();
+ // TODO(vinod): Like the slave, add a random delay to the first
+ // registration attempt too. This requires fixing tests that
+ // expect the scheduler to register even with the clock paused
+ // (e.g., rate limiting tests).
+ doReliableRegistration(flags.registration_backoff_factor);
}
} else {
// In this case, we don't actually invoke Scheduler::error
@@ -359,7 +371,7 @@ protected:
authenticated = true;
authenticating = None();
- doReliableRegistration(); // Proceed with registration.
+ doReliableRegistration(flags.registration_backoff_factor);
}
void authenticationTimeout(Future<bool> future)
@@ -463,7 +475,7 @@ protected:
VLOG(1) << "Scheduler::reregistered took " << stopwatch.elapsed();
}
- void doReliableRegistration()
+ void doReliableRegistration(Duration maxBackoff)
{
if (connected || master.isNone()) {
return;
@@ -488,7 +500,29 @@ protected:
send(master.get(), message);
}
- delay(Seconds(1), self(), &Self::doReliableRegistration);
+ // Bound the maximum backoff by 'REGISTRATION_RETRY_INTERVAL_MAX'.
+ maxBackoff =
+ std::min(maxBackoff, scheduler::REGISTRATION_RETRY_INTERVAL_MAX);
+
+ // If failover timeout is present, bound the maximum backoff
+ // by 1/10th of the failover timeout.
+ if (framework.has_failover_timeout()) {
+ Try<Duration> duration = Duration::create(framework.failover_timeout());
+ if (duration.isSome()) {
+ maxBackoff = std::min(maxBackoff, duration.get() / 10);
+ }
+ }
+
+ // Determine the delay for next attempt by picking a random
+ // duration between 0 and 'maxBackoff'.
+ // TODO(vinod): Use random numbers from <random> header.
+ Duration delay = maxBackoff * ((double) ::random() / RAND_MAX);
+
+ VLOG(1) << "Will retry registration in " << delay << " if necessary";
+
+ // Backoff.
+ process::delay(
+ delay, self(), &Self::doReliableRegistration, maxBackoff * 2);
}
void resourceOffers(
@@ -1021,6 +1055,8 @@ private:
MasterDetector* detector;
+ const internal::scheduler::Flags flags;
+
hashmap<OfferID, hashmap<SlaveID, UPID> > savedOffers;
hashmap<SlaveID, UPID> savedSlavePids;
@@ -1051,7 +1087,6 @@ void MesosSchedulerDriver::initialize() {
// we'll probably want a way to load master::Flags and slave::Flags
// as well.
local::Flags flags;
-
Try<Nothing> load = flags.load("MESOS_");
if (load.isError()) {
@@ -1116,6 +1151,7 @@ void MesosSchedulerDriver::initialize() {
url = pid.isSome() ? static_cast<string>(pid.get()) : master;
}
+
// Implementation of C++ API.
//
// Notes:
@@ -1230,6 +1266,16 @@ Status MesosSchedulerDriver::start()
detector = detector_.get();
}
+ // Load scheduler flags.
+ internal::scheduler::Flags flags;
+ Try<Nothing> load = flags.load("MESOS_");
+
+ if (load.isError()) {
+ status = DRIVER_ABORTED;
+ scheduler->error(this, load.error());
+ return status;
+ }
+
CHECK(process == NULL);
if (credential == NULL) {
@@ -1240,6 +1286,7 @@ Status MesosSchedulerDriver::start()
None(),
schedulerId,
detector,
+ flags,
&mutex,
&cond);
} else {
@@ -1251,6 +1298,7 @@ Status MesosSchedulerDriver::start()
cred,
schedulerId,
detector,
+ flags,
&mutex,
&cond);
}
http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index dbfd1a8..81e0c4b 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -906,7 +906,7 @@ void Slave::reregistered(
}
-void Slave::doReliableRegistration(const Duration& duration)
+void Slave::doReliableRegistration(Duration maxBackoff)
{
if (master.isNone()) {
LOG(INFO) << "Skipping registration because no master present";
@@ -1040,19 +1040,17 @@ void Slave::doReliableRegistration(const Duration& duration)
send(master.get(), message);
}
- // Retry registration if necessary.
- Duration next = std::min(
- duration * ((double) ::random() / RAND_MAX),
- REGISTER_RETRY_INTERVAL_MAX);
+ // Bound the maximum backoff by 'REGISTER_RETRY_INTERVAL_MAX'.
+ maxBackoff = std::min(maxBackoff, REGISTER_RETRY_INTERVAL_MAX);
- Duration duration_ = std::min(
- duration * 2,
- REGISTER_RETRY_INTERVAL_MAX);
+ // Determine the delay for next attempt by picking a random
+ // duration between 0 and 'maxBackoff'.
+ Duration delay = maxBackoff * ((double) ::random() / RAND_MAX);
- VLOG(1) << "Will retry registration in " << next << " if necessary";
+ VLOG(1) << "Will retry registration in " << delay << " if necessary";
// Backoff.
- delay(next, self(), &Slave::doReliableRegistration, duration_);
+ process::delay(delay, self(), &Slave::doReliableRegistration, maxBackoff * 2);
}
http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index 5b082fc..72bbec9 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -104,7 +104,7 @@ public:
const SlaveID& slaveId,
const std::vector<ReconcileTasksMessage>& reconciliations);
- void doReliableRegistration(const Duration& duration);
+ void doReliableRegistration(Duration maxBackoff);
// Made 'virtual' for Slave mocking.
virtual void runTask(
http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/tests/fault_tolerance_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/fault_tolerance_tests.cpp b/src/tests/fault_tolerance_tests.cpp
index 372c4fd..5baeda6 100644
--- a/src/tests/fault_tolerance_tests.cpp
+++ b/src/tests/fault_tolerance_tests.cpp
@@ -40,6 +40,8 @@
#include "master/allocator.hpp"
#include "master/master.hpp"
+#include "sched/constants.hpp"
+
#include "slave/constants.hpp"
#include "slave/slave.hpp"
@@ -803,7 +805,7 @@ TEST_F(FaultToleranceTest, SchedulerFailoverRetriedReregistration)
AWAIT_READY(reregistrationMessage);
// Trigger the re-registration retry.
- Clock::advance(Seconds(1));
+ Clock::advance(internal::scheduler::REGISTRATION_BACKOFF_FACTOR);
AWAIT_READY(sched2Registered);
@@ -856,9 +858,8 @@ TEST_F(FaultToleranceTest, FrameworkReliableRegistration)
AWAIT_READY(frameworkRegisteredMessage);
- // TODO(benh): Pull out constant from SchedulerProcess.
Clock::pause();
- Clock::advance(Seconds(1));
+ Clock::advance(internal::scheduler::REGISTRATION_BACKOFF_FACTOR);
AWAIT_READY(registered); // Ensures registered message is received.