You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2017/05/23 22:51:54 UTC
[1/2] mesos git commit: Added new agent flag
'executor_reregistration_timeout'.
Repository: mesos
Updated Branches:
refs/heads/master d9fbff98d -> 044196ff6
Added new agent flag 'executor_reregistration_timeout'.
This patch adds a new agent flag, 'executor_reregistration_timeout',
which allows operators to configure the time which an executor has
to reregister with the agent before it will be considered gone and
terminated.
Review: https://reviews.apache.org/r/59460/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/4a5c2620
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/4a5c2620
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/4a5c2620
Branch: refs/heads/master
Commit: 4a5c262063af7c69724ee0a78901746580d807cb
Parents: d9fbff9
Author: Greg Mann <gr...@mesosphere.io>
Authored: Tue May 23 15:36:36 2017 -0700
Committer: Benjamin Mahler <bm...@apache.org>
Committed: Tue May 23 15:36:36 2017 -0700
----------------------------------------------------------------------
docs/configuration.md | 11 +++++++++++
src/slave/constants.hpp | 11 ++++++++++-
src/slave/flags.cpp | 16 ++++++++++++++++
src/slave/flags.hpp | 1 +
src/slave/slave.cpp | 8 +++++---
5 files changed, 43 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/4a5c2620/docs/configuration.md
----------------------------------------------------------------------
diff --git a/docs/configuration.md b/docs/configuration.md
index c5194e7..59e1bbe 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -1451,6 +1451,17 @@ shutting it down (e.g., 60secs, 3mins, etc) (default: 1mins)
</tr>
<tr>
<td>
+ --executor_reregistration_timeout=VALUE
+ </td>
+ <td>
+The timeout within which an executor is expected to re-register after
+the agent has restarted, before the agent considers it gone and shuts
+it down. Note that currently, the agent will not re-register with the
+master until this timeout has elapsed (see MESOS-7539). (default: 2secs)
+ </td>
+</tr>
+<tr>
+ <td>
--max_completed_executors_per_framework
</td>
<td>
http://git-wip-us.apache.org/repos/asf/mesos/blob/4a5c2620/src/slave/constants.hpp
----------------------------------------------------------------------
diff --git a/src/slave/constants.hpp b/src/slave/constants.hpp
index 9c1d724..435c37e 100644
--- a/src/slave/constants.hpp
+++ b/src/slave/constants.hpp
@@ -33,7 +33,16 @@ namespace slave {
// details in MESOS-1023.
constexpr Duration EXECUTOR_REGISTRATION_TIMEOUT = Minutes(1);
-constexpr Duration EXECUTOR_REREGISTER_TIMEOUT = Seconds(2);
+constexpr Duration EXECUTOR_REREGISTRATION_TIMEOUT = Seconds(2);
+
+// The maximum timeout within which an executor can re-register.
+// Note that this value has to be << 'MIN_AGENT_REREGISTER_TIMEOUT'
+// declared in 'master/constants.hpp'; since agent recovery will only
+// complete after this timeout has elapsed, this ensures that the
+// agent can re-register with the master before it is marked
+// unreachable and its tasks are transitioned to TASK_UNREACHABLE or
+// TASK_LOST.
+constexpr Duration MAX_EXECUTOR_REREGISTRATION_TIMEOUT = Seconds(15);
// The default amount of time to wait for the executor to
// shut down before destroying the container.
http://git-wip-us.apache.org/repos/asf/mesos/blob/4a5c2620/src/slave/flags.cpp
----------------------------------------------------------------------
diff --git a/src/slave/flags.cpp b/src/slave/flags.cpp
index c1e9568..0c8276e 100644
--- a/src/slave/flags.cpp
+++ b/src/slave/flags.cpp
@@ -332,6 +332,22 @@ mesos::internal::slave::Flags::Flags()
"shutting it down (e.g., 60secs, 3mins, etc)",
EXECUTOR_REGISTRATION_TIMEOUT);
+ add(&Flags::executor_reregistration_timeout,
+ "executor_reregistration_timeout",
+ "The timeout within which an executor is expected to re-register after\n"
+ "the agent has restarted, before the agent considers it gone and shuts\n"
+ "it down. Note that currently, the agent will not re-register with the\n"
+ "master until this timeout has elapsed (see MESOS-7539).",
+ EXECUTOR_REREGISTRATION_TIMEOUT,
+ [](const Duration& value) -> Option<Error> {
+ if (value > MAX_EXECUTOR_REREGISTRATION_TIMEOUT) {
+ return Error("Expected `--executor_reregistration_timeout` "
+ "to be not more than " +
+ stringify(MAX_EXECUTOR_REREGISTRATION_TIMEOUT));
+ }
+ return None();
+ });
+
add(&Flags::executor_shutdown_grace_period,
"executor_shutdown_grace_period",
"Default amount of time to wait for an executor to shut down\n"
http://git-wip-us.apache.org/repos/asf/mesos/blob/4a5c2620/src/slave/flags.hpp
----------------------------------------------------------------------
diff --git a/src/slave/flags.hpp b/src/slave/flags.hpp
index b5dd841..b669956 100644
--- a/src/slave/flags.hpp
+++ b/src/slave/flags.hpp
@@ -77,6 +77,7 @@ public:
Duration authentication_backoff_factor;
Option<JSON::Object> executor_environment_variables;
Duration executor_registration_timeout;
+ Duration executor_reregistration_timeout;
Duration executor_shutdown_grace_period;
#ifdef USE_SSL_SOCKET
Option<Path> executor_secret_key;
http://git-wip-us.apache.org/repos/asf/mesos/blob/4a5c2620/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 14de72f..15e4d68 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -4186,7 +4186,7 @@ void Slave::reregisterExecutorTimeout()
TaskStatus::REASON_EXECUTOR_REREGISTRATION_TIMEOUT);
termination.set_message(
"Executor did not re-register within " +
- stringify(EXECUTOR_REREGISTER_TIMEOUT));
+ stringify(flags.executor_reregistration_timeout));
executor->pendingTermination = termination;
break;
@@ -4200,6 +4200,8 @@ void Slave::reregisterExecutorTimeout()
}
// Signal the end of recovery.
+ // TODO(greggomann): Allow the agent to complete recovery before the executor
+ // re-registration timeout has elapsed. See MESOS-7539
recoveryInfo.recovered.set(Nothing());
}
@@ -5945,7 +5947,7 @@ Future<Nothing> Slave::_recover()
if (!frameworks.empty() && flags.recover == "reconnect") {
// Cleanup unregistered executors after a delay.
- delay(EXECUTOR_REREGISTER_TIMEOUT,
+ delay(flags.executor_reregistration_timeout,
self(),
&Slave::reregisterExecutorTimeout);
@@ -7619,7 +7621,7 @@ map<string, string> executorEnvironment(
// The maximum backoff duration to be used by an executor between two
// retries when disconnected.
environment["MESOS_SUBSCRIPTION_BACKOFF_MAX"] =
- stringify(EXECUTOR_REREGISTER_TIMEOUT);
+ stringify(flags.executor_reregistration_timeout);
}
if (authenticationToken.isSome()) {
[2/2] mesos git commit: Added 'executor_reregister_timeout' agent
flag to the tests.
Posted by bm...@apache.org.
Added 'executor_reregister_timeout' agent flag to the tests.
This patch updates the tests to make use of the new
'executor_reregister_timeout' agent flag.
Review: https://reviews.apache.org/r/59461/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/044196ff
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/044196ff
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/044196ff
Branch: refs/heads/master
Commit: 044196ff6d088675ee1773fca7800af89f743051
Parents: 4a5c262
Author: Greg Mann <gr...@mesosphere.io>
Authored: Tue May 23 15:37:34 2017 -0700
Committer: Benjamin Mahler <bm...@apache.org>
Committed: Tue May 23 15:37:34 2017 -0700
----------------------------------------------------------------------
src/tests/containerizer/port_mapping_tests.cpp | 4 ++--
src/tests/disk_quota_tests.cpp | 2 +-
src/tests/master_tests.cpp | 4 ++--
src/tests/persistent_volume_tests.cpp | 2 +-
src/tests/slave_recovery_tests.cpp | 26 ++++++++++-----------
src/tests/slave_tests.cpp | 2 +-
6 files changed, 20 insertions(+), 20 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/044196ff/src/tests/containerizer/port_mapping_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/port_mapping_tests.cpp b/src/tests/containerizer/port_mapping_tests.cpp
index a528382..d062f2f 100644
--- a/src/tests/containerizer/port_mapping_tests.cpp
+++ b/src/tests/containerizer/port_mapping_tests.cpp
@@ -1962,7 +1962,7 @@ TEST_F(PortMappingMesosTest, CGROUPS_ROOT_RecoverMixedContainers)
AWAIT_READY(_recover1);
Clock::settle(); // Wait for slave to schedule reregister timeout.
- Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(slaveFlags.executor_reregistration_timeout);
AWAIT_READY(slaveReregisteredMessage1);
@@ -2011,7 +2011,7 @@ TEST_F(PortMappingMesosTest, CGROUPS_ROOT_RecoverMixedContainers)
AWAIT_READY(_recover2);
Clock::settle(); // Wait for slave to schedule reregister timeout.
- Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(slaveFlags.executor_reregistration_timeout);
AWAIT_READY(slaveReregisteredMessage2);
http://git-wip-us.apache.org/repos/asf/mesos/blob/044196ff/src/tests/disk_quota_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/disk_quota_tests.cpp b/src/tests/disk_quota_tests.cpp
index f758bee..9fe2c86 100644
--- a/src/tests/disk_quota_tests.cpp
+++ b/src/tests/disk_quota_tests.cpp
@@ -701,7 +701,7 @@ TEST_F(DiskQuotaTest, SlaveRecovery)
AWAIT_READY(reregisterExecutorMessage);
// Ensure the slave considers itself recovered.
- Clock::advance(slave::EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(flags.executor_reregistration_timeout);
// NOTE: We resume the clock because we need the reaper to reap the
// 'du' subprocess.
http://git-wip-us.apache.org/repos/asf/mesos/blob/044196ff/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 754f7bd..1dfe5fd 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -6442,7 +6442,7 @@ TEST_F(MasterTest, AgentRestartNoReregister)
// The agent waits for the executor reregister timeout to expire,
// even if all executors have re-reregistered.
- Clock::advance(slave::EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(agentFlags.executor_reregistration_timeout);
Clock::settle();
// Agent will try to re-register after completing recovery; prevent
@@ -6473,7 +6473,7 @@ TEST_F(MasterTest, AgentRestartNoReregister)
.WillOnce(FutureArg<1>(&unreachableStatus));
Duration elapsedTime =
- masterFlags.agent_ping_timeout + slave::EXECUTOR_REREGISTER_TIMEOUT;
+ masterFlags.agent_ping_timeout + agentFlags.executor_reregistration_timeout;
Duration remainingReregisterTime =
masterFlags.agent_reregister_timeout - elapsedTime;
http://git-wip-us.apache.org/repos/asf/mesos/blob/044196ff/src/tests/persistent_volume_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/persistent_volume_tests.cpp b/src/tests/persistent_volume_tests.cpp
index ef00853..ce716bc 100644
--- a/src/tests/persistent_volume_tests.cpp
+++ b/src/tests/persistent_volume_tests.cpp
@@ -1955,7 +1955,7 @@ TEST_P(PersistentVolumeTest, SlaveRecovery)
Clock::settle();
// Ensure the slave considers itself recovered.
- Clock::advance(slave::EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(slaveFlags.executor_reregistration_timeout);
Clock::resume();
http://git-wip-us.apache.org/repos/asf/mesos/blob/044196ff/src/tests/slave_recovery_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_recovery_tests.cpp b/src/tests/slave_recovery_tests.cpp
index 52e78b6..0aa87f5 100644
--- a/src/tests/slave_recovery_tests.cpp
+++ b/src/tests/slave_recovery_tests.cpp
@@ -889,7 +889,7 @@ TYPED_TEST(SlaveRecoveryTest, DISABLED_RecoverUnregisteredHTTPExecutor)
Clock::settle(); // Wait for slave to schedule reregister timeout.
// Ensure the slave considers itself recovered.
- Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(flags.executor_reregistration_timeout);
// Now advance time until the reaper reaps the executor.
while (status.isPending()) {
@@ -1001,7 +1001,7 @@ TYPED_TEST(SlaveRecoveryTest, RecoverUnregisteredExecutor)
Clock::settle(); // Wait for slave to schedule reregister timeout.
// Ensure the slave considers itself recovered.
- Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(flags.executor_reregistration_timeout);
// Now advance time until the reaper reaps the executor.
while (status.isPending()) {
@@ -1119,7 +1119,7 @@ TYPED_TEST(SlaveRecoveryTest, KillTaskUnregisteredExecutor)
Clock::settle(); // Wait for the agent to schedule reregister timeout.
// Ensure the agent considers itself recovered.
- Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(flags.executor_reregistration_timeout);
while(executorTerminated.isPending()) {
Clock::advance(process::MAX_REAP_INTERVAL());
@@ -1256,7 +1256,7 @@ TYPED_TEST(SlaveRecoveryTest, RecoverTerminatedHTTPExecutor)
Clock::settle(); // Wait for slave to schedule reregister timeout.
// Ensure the slave considers itself recovered.
- Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(flags.executor_reregistration_timeout);
// Now advance time until the reaper reaps the executor.
while (status.isPending()) {
@@ -1376,7 +1376,7 @@ TYPED_TEST(SlaveRecoveryTest, RecoverTerminatedExecutor)
Clock::settle(); // Wait for slave to schedule reregister timeout.
// Ensure the slave considers itself recovered.
- Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(flags.executor_reregistration_timeout);
// Now advance time until the reaper reaps the executor.
while (status.isPending()) {
@@ -1493,7 +1493,7 @@ TYPED_TEST(SlaveRecoveryTest, DISABLED_RecoveryTimeout)
AWAIT_READY(_recover);
// Ensure the slave considers itself recovered.
- Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(flags.executor_reregistration_timeout);
Clock::resume();
// Scheduler should receive the TASK_FAILED update.
@@ -2206,7 +2206,7 @@ TYPED_TEST(SlaveRecoveryTest, KillTask)
AWAIT_READY(reregisterExecutorMessage);
// Ensure the slave considers itself recovered.
- Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(flags.executor_reregistration_timeout);
Clock::resume();
// Wait for the slave to re-register.
@@ -2497,7 +2497,7 @@ TYPED_TEST(SlaveRecoveryTest, GCExecutor)
Clock::settle(); // Wait for slave to schedule reregister timeout.
// Ensure the slave considers itself recovered.
- Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(flags.executor_reregistration_timeout);
AWAIT_READY(slaveReregisteredMessage);
@@ -3388,7 +3388,7 @@ TYPED_TEST(SlaveRecoveryTest, SchedulerFailover)
AWAIT_READY(reregisterExecutorMessage);
// Ensure the slave considers itself recovered.
- Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(flags.executor_reregistration_timeout);
Clock::resume();
// Wait for the slave to re-register.
@@ -3534,7 +3534,7 @@ TYPED_TEST(SlaveRecoveryTest, MasterFailover)
AWAIT_READY(reregisterExecutorMessage);
// Ensure the slave considers itself recovered.
- Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(flags.executor_reregistration_timeout);
Clock::resume();
// Wait for the slave to re-register.
@@ -3693,7 +3693,7 @@ TYPED_TEST(SlaveRecoveryTest, MultipleFrameworks)
AWAIT_READY(reregisterExecutorMessage2);
// Ensure the slave considers itself recovered.
- Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(flags.executor_reregistration_timeout);
Clock::resume();
// Wait for the slave to re-register.
@@ -3869,7 +3869,7 @@ TYPED_TEST(SlaveRecoveryTest, MultipleSlaves)
AWAIT_READY(reregisterExecutorMessage2);
// Ensure the slave considers itself recovered.
- Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(flags1.executor_reregistration_timeout);
Clock::resume();
// Wait for the slaves to re-register.
@@ -3990,7 +3990,7 @@ TYPED_TEST(SlaveRecoveryTest, RestartBeforeContainerizerLaunch)
Clock::settle(); // Wait for slave to schedule reregister timeout.
// Ensure the slave considers itself recovered.
- Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(flags.executor_reregistration_timeout);
Clock::resume();
// Scheduler should receive the TASK_FAILED update.
http://git-wip-us.apache.org/repos/asf/mesos/blob/044196ff/src/tests/slave_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 8f81f77..d124d59 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -5005,7 +5005,7 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(SlaveTest, HTTPSchedulerSlaveRestart)
Clock::settle();
// Ensure the slave considers itself recovered.
- Clock::advance(slave::EXECUTOR_REREGISTER_TIMEOUT);
+ Clock::advance(flags.executor_reregistration_timeout);
Clock::resume();