You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2017/05/23 22:51:54 UTC

[1/2] mesos git commit: Added new agent flag 'executor_reregistration_timeout'.

Repository: mesos
Updated Branches:
  refs/heads/master d9fbff98d -> 044196ff6


Added new agent flag 'executor_reregistration_timeout'.

This patch adds a new agent flag, 'executor_reregistration_timeout',
which allows operators to configure the time which an executor has
to reregister with the agent before it will be considered gone and
terminated.

Review: https://reviews.apache.org/r/59460/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/4a5c2620
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/4a5c2620
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/4a5c2620

Branch: refs/heads/master
Commit: 4a5c262063af7c69724ee0a78901746580d807cb
Parents: d9fbff9
Author: Greg Mann <gr...@mesosphere.io>
Authored: Tue May 23 15:36:36 2017 -0700
Committer: Benjamin Mahler <bm...@apache.org>
Committed: Tue May 23 15:36:36 2017 -0700

----------------------------------------------------------------------
 docs/configuration.md   | 11 +++++++++++
 src/slave/constants.hpp | 11 ++++++++++-
 src/slave/flags.cpp     | 16 ++++++++++++++++
 src/slave/flags.hpp     |  1 +
 src/slave/slave.cpp     |  8 +++++---
 5 files changed, 43 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/4a5c2620/docs/configuration.md
----------------------------------------------------------------------
diff --git a/docs/configuration.md b/docs/configuration.md
index c5194e7..59e1bbe 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -1451,6 +1451,17 @@ shutting it down (e.g., 60secs, 3mins, etc) (default: 1mins)
 </tr>
 <tr>
   <td>
+    --executor_reregistration_timeout=VALUE
+  </td>
+  <td>
+The timeout within which an executor is expected to re-register after
+the agent has restarted, before the agent considers it gone and shuts
+it down. Note that currently, the agent will not re-register with the
+master until this timeout has elapsed (see MESOS-7539). (default: 2secs)
+  </td>
+</tr>
+<tr>
+  <td>
     --max_completed_executors_per_framework
   </td>
   <td>

http://git-wip-us.apache.org/repos/asf/mesos/blob/4a5c2620/src/slave/constants.hpp
----------------------------------------------------------------------
diff --git a/src/slave/constants.hpp b/src/slave/constants.hpp
index 9c1d724..435c37e 100644
--- a/src/slave/constants.hpp
+++ b/src/slave/constants.hpp
@@ -33,7 +33,16 @@ namespace slave {
 // details in MESOS-1023.
 
 constexpr Duration EXECUTOR_REGISTRATION_TIMEOUT = Minutes(1);
-constexpr Duration EXECUTOR_REREGISTER_TIMEOUT = Seconds(2);
+constexpr Duration EXECUTOR_REREGISTRATION_TIMEOUT = Seconds(2);
+
+// The maximum timeout within which an executor can re-register.
+// Note that this value has to be << 'MIN_AGENT_REREGISTER_TIMEOUT'
+// declared in 'master/constants.hpp'; since agent recovery will only
+// complete after this timeout has elapsed, this ensures that the
+// agent can re-register with the master before it is marked
+// unreachable and its tasks are transitioned to TASK_UNREACHABLE or
+// TASK_LOST.
+constexpr Duration MAX_EXECUTOR_REREGISTRATION_TIMEOUT = Seconds(15);
 
 // The default amount of time to wait for the executor to
 // shut down before destroying the container.

http://git-wip-us.apache.org/repos/asf/mesos/blob/4a5c2620/src/slave/flags.cpp
----------------------------------------------------------------------
diff --git a/src/slave/flags.cpp b/src/slave/flags.cpp
index c1e9568..0c8276e 100644
--- a/src/slave/flags.cpp
+++ b/src/slave/flags.cpp
@@ -332,6 +332,22 @@ mesos::internal::slave::Flags::Flags()
       "shutting it down (e.g., 60secs, 3mins, etc)",
       EXECUTOR_REGISTRATION_TIMEOUT);
 
+  add(&Flags::executor_reregistration_timeout,
+      "executor_reregistration_timeout",
+      "The timeout within which an executor is expected to re-register after\n"
+      "the agent has restarted, before the agent considers it gone and shuts\n"
+      "it down. Note that currently, the agent will not re-register with the\n"
+      "master until this timeout has elapsed (see MESOS-7539).",
+      EXECUTOR_REREGISTRATION_TIMEOUT,
+      [](const Duration& value) -> Option<Error> {
+        if (value > MAX_EXECUTOR_REREGISTRATION_TIMEOUT) {
+          return Error("Expected `--executor_reregistration_timeout` "
+                       "to be not more than " +
+                       stringify(MAX_EXECUTOR_REREGISTRATION_TIMEOUT));
+        }
+        return None();
+      });
+
   add(&Flags::executor_shutdown_grace_period,
       "executor_shutdown_grace_period",
       "Default amount of time to wait for an executor to shut down\n"

http://git-wip-us.apache.org/repos/asf/mesos/blob/4a5c2620/src/slave/flags.hpp
----------------------------------------------------------------------
diff --git a/src/slave/flags.hpp b/src/slave/flags.hpp
index b5dd841..b669956 100644
--- a/src/slave/flags.hpp
+++ b/src/slave/flags.hpp
@@ -77,6 +77,7 @@ public:
   Duration authentication_backoff_factor;
   Option<JSON::Object> executor_environment_variables;
   Duration executor_registration_timeout;
+  Duration executor_reregistration_timeout;
   Duration executor_shutdown_grace_period;
 #ifdef USE_SSL_SOCKET
   Option<Path> executor_secret_key;

http://git-wip-us.apache.org/repos/asf/mesos/blob/4a5c2620/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 14de72f..15e4d68 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -4186,7 +4186,7 @@ void Slave::reregisterExecutorTimeout()
               TaskStatus::REASON_EXECUTOR_REREGISTRATION_TIMEOUT);
           termination.set_message(
               "Executor did not re-register within " +
-              stringify(EXECUTOR_REREGISTER_TIMEOUT));
+              stringify(flags.executor_reregistration_timeout));
 
           executor->pendingTermination = termination;
           break;
@@ -4200,6 +4200,8 @@ void Slave::reregisterExecutorTimeout()
   }
 
   // Signal the end of recovery.
+  // TODO(greggomann): Allow the agent to complete recovery before the executor
+  // re-registration timeout has elapsed. See MESOS-7539
   recoveryInfo.recovered.set(Nothing());
 }
 
@@ -5945,7 +5947,7 @@ Future<Nothing> Slave::_recover()
 
   if (!frameworks.empty() && flags.recover == "reconnect") {
     // Cleanup unregistered executors after a delay.
-    delay(EXECUTOR_REREGISTER_TIMEOUT,
+    delay(flags.executor_reregistration_timeout,
           self(),
           &Slave::reregisterExecutorTimeout);
 
@@ -7619,7 +7621,7 @@ map<string, string> executorEnvironment(
     // The maximum backoff duration to be used by an executor between two
     // retries when disconnected.
     environment["MESOS_SUBSCRIPTION_BACKOFF_MAX"] =
-      stringify(EXECUTOR_REREGISTER_TIMEOUT);
+      stringify(flags.executor_reregistration_timeout);
   }
 
   if (authenticationToken.isSome()) {


[2/2] mesos git commit: Added 'executor_reregister_timeout' agent flag to the tests.

Posted by bm...@apache.org.
Added 'executor_reregister_timeout' agent flag to the tests.

This patch updates the tests to make use of the new
'executor_reregister_timeout' agent flag.

Review: https://reviews.apache.org/r/59461/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/044196ff
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/044196ff
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/044196ff

Branch: refs/heads/master
Commit: 044196ff6d088675ee1773fca7800af89f743051
Parents: 4a5c262
Author: Greg Mann <gr...@mesosphere.io>
Authored: Tue May 23 15:37:34 2017 -0700
Committer: Benjamin Mahler <bm...@apache.org>
Committed: Tue May 23 15:37:34 2017 -0700

----------------------------------------------------------------------
 src/tests/containerizer/port_mapping_tests.cpp |  4 ++--
 src/tests/disk_quota_tests.cpp                 |  2 +-
 src/tests/master_tests.cpp                     |  4 ++--
 src/tests/persistent_volume_tests.cpp          |  2 +-
 src/tests/slave_recovery_tests.cpp             | 26 ++++++++++-----------
 src/tests/slave_tests.cpp                      |  2 +-
 6 files changed, 20 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/044196ff/src/tests/containerizer/port_mapping_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/port_mapping_tests.cpp b/src/tests/containerizer/port_mapping_tests.cpp
index a528382..d062f2f 100644
--- a/src/tests/containerizer/port_mapping_tests.cpp
+++ b/src/tests/containerizer/port_mapping_tests.cpp
@@ -1962,7 +1962,7 @@ TEST_F(PortMappingMesosTest, CGROUPS_ROOT_RecoverMixedContainers)
   AWAIT_READY(_recover1);
 
   Clock::settle(); // Wait for slave to schedule reregister timeout.
-  Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(slaveFlags.executor_reregistration_timeout);
 
   AWAIT_READY(slaveReregisteredMessage1);
 
@@ -2011,7 +2011,7 @@ TEST_F(PortMappingMesosTest, CGROUPS_ROOT_RecoverMixedContainers)
   AWAIT_READY(_recover2);
 
   Clock::settle(); // Wait for slave to schedule reregister timeout.
-  Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(slaveFlags.executor_reregistration_timeout);
 
   AWAIT_READY(slaveReregisteredMessage2);
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/044196ff/src/tests/disk_quota_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/disk_quota_tests.cpp b/src/tests/disk_quota_tests.cpp
index f758bee..9fe2c86 100644
--- a/src/tests/disk_quota_tests.cpp
+++ b/src/tests/disk_quota_tests.cpp
@@ -701,7 +701,7 @@ TEST_F(DiskQuotaTest, SlaveRecovery)
   AWAIT_READY(reregisterExecutorMessage);
 
   // Ensure the slave considers itself recovered.
-  Clock::advance(slave::EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(flags.executor_reregistration_timeout);
 
   // NOTE: We resume the clock because we need the reaper to reap the
   // 'du' subprocess.

http://git-wip-us.apache.org/repos/asf/mesos/blob/044196ff/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 754f7bd..1dfe5fd 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -6442,7 +6442,7 @@ TEST_F(MasterTest, AgentRestartNoReregister)
 
   // The agent waits for the executor reregister timeout to expire,
   // even if all executors have re-reregistered.
-  Clock::advance(slave::EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(agentFlags.executor_reregistration_timeout);
   Clock::settle();
 
   // Agent will try to re-register after completing recovery; prevent
@@ -6473,7 +6473,7 @@ TEST_F(MasterTest, AgentRestartNoReregister)
     .WillOnce(FutureArg<1>(&unreachableStatus));
 
   Duration elapsedTime =
-    masterFlags.agent_ping_timeout + slave::EXECUTOR_REREGISTER_TIMEOUT;
+    masterFlags.agent_ping_timeout + agentFlags.executor_reregistration_timeout;
 
   Duration remainingReregisterTime =
     masterFlags.agent_reregister_timeout - elapsedTime;

http://git-wip-us.apache.org/repos/asf/mesos/blob/044196ff/src/tests/persistent_volume_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/persistent_volume_tests.cpp b/src/tests/persistent_volume_tests.cpp
index ef00853..ce716bc 100644
--- a/src/tests/persistent_volume_tests.cpp
+++ b/src/tests/persistent_volume_tests.cpp
@@ -1955,7 +1955,7 @@ TEST_P(PersistentVolumeTest, SlaveRecovery)
   Clock::settle();
 
   // Ensure the slave considers itself recovered.
-  Clock::advance(slave::EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(slaveFlags.executor_reregistration_timeout);
 
   Clock::resume();
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/044196ff/src/tests/slave_recovery_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_recovery_tests.cpp b/src/tests/slave_recovery_tests.cpp
index 52e78b6..0aa87f5 100644
--- a/src/tests/slave_recovery_tests.cpp
+++ b/src/tests/slave_recovery_tests.cpp
@@ -889,7 +889,7 @@ TYPED_TEST(SlaveRecoveryTest, DISABLED_RecoverUnregisteredHTTPExecutor)
   Clock::settle(); // Wait for slave to schedule reregister timeout.
 
   // Ensure the slave considers itself recovered.
-  Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(flags.executor_reregistration_timeout);
 
   // Now advance time until the reaper reaps the executor.
   while (status.isPending()) {
@@ -1001,7 +1001,7 @@ TYPED_TEST(SlaveRecoveryTest, RecoverUnregisteredExecutor)
   Clock::settle(); // Wait for slave to schedule reregister timeout.
 
   // Ensure the slave considers itself recovered.
-  Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(flags.executor_reregistration_timeout);
 
   // Now advance time until the reaper reaps the executor.
   while (status.isPending()) {
@@ -1119,7 +1119,7 @@ TYPED_TEST(SlaveRecoveryTest, KillTaskUnregisteredExecutor)
   Clock::settle(); // Wait for the agent to schedule reregister timeout.
 
   // Ensure the agent considers itself recovered.
-  Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(flags.executor_reregistration_timeout);
 
   while(executorTerminated.isPending()) {
     Clock::advance(process::MAX_REAP_INTERVAL());
@@ -1256,7 +1256,7 @@ TYPED_TEST(SlaveRecoveryTest, RecoverTerminatedHTTPExecutor)
   Clock::settle(); // Wait for slave to schedule reregister timeout.
 
   // Ensure the slave considers itself recovered.
-  Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(flags.executor_reregistration_timeout);
 
   // Now advance time until the reaper reaps the executor.
   while (status.isPending()) {
@@ -1376,7 +1376,7 @@ TYPED_TEST(SlaveRecoveryTest, RecoverTerminatedExecutor)
   Clock::settle(); // Wait for slave to schedule reregister timeout.
 
   // Ensure the slave considers itself recovered.
-  Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(flags.executor_reregistration_timeout);
 
   // Now advance time until the reaper reaps the executor.
   while (status.isPending()) {
@@ -1493,7 +1493,7 @@ TYPED_TEST(SlaveRecoveryTest, DISABLED_RecoveryTimeout)
   AWAIT_READY(_recover);
 
   // Ensure the slave considers itself recovered.
-  Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(flags.executor_reregistration_timeout);
   Clock::resume();
 
   // Scheduler should receive the TASK_FAILED update.
@@ -2206,7 +2206,7 @@ TYPED_TEST(SlaveRecoveryTest, KillTask)
   AWAIT_READY(reregisterExecutorMessage);
 
   // Ensure the slave considers itself recovered.
-  Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(flags.executor_reregistration_timeout);
   Clock::resume();
 
   // Wait for the slave to re-register.
@@ -2497,7 +2497,7 @@ TYPED_TEST(SlaveRecoveryTest, GCExecutor)
   Clock::settle(); // Wait for slave to schedule reregister timeout.
 
   // Ensure the slave considers itself recovered.
-  Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(flags.executor_reregistration_timeout);
 
   AWAIT_READY(slaveReregisteredMessage);
 
@@ -3388,7 +3388,7 @@ TYPED_TEST(SlaveRecoveryTest, SchedulerFailover)
   AWAIT_READY(reregisterExecutorMessage);
 
   // Ensure the slave considers itself recovered.
-  Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(flags.executor_reregistration_timeout);
   Clock::resume();
 
   // Wait for the slave to re-register.
@@ -3534,7 +3534,7 @@ TYPED_TEST(SlaveRecoveryTest, MasterFailover)
   AWAIT_READY(reregisterExecutorMessage);
 
   // Ensure the slave considers itself recovered.
-  Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(flags.executor_reregistration_timeout);
   Clock::resume();
 
   // Wait for the slave to re-register.
@@ -3693,7 +3693,7 @@ TYPED_TEST(SlaveRecoveryTest, MultipleFrameworks)
   AWAIT_READY(reregisterExecutorMessage2);
 
   // Ensure the slave considers itself recovered.
-  Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(flags.executor_reregistration_timeout);
   Clock::resume();
 
   // Wait for the slave to re-register.
@@ -3869,7 +3869,7 @@ TYPED_TEST(SlaveRecoveryTest, MultipleSlaves)
   AWAIT_READY(reregisterExecutorMessage2);
 
   // Ensure the slave considers itself recovered.
-  Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(flags1.executor_reregistration_timeout);
   Clock::resume();
 
   // Wait for the slaves to re-register.
@@ -3990,7 +3990,7 @@ TYPED_TEST(SlaveRecoveryTest, RestartBeforeContainerizerLaunch)
   Clock::settle(); // Wait for slave to schedule reregister timeout.
 
   // Ensure the slave considers itself recovered.
-  Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(flags.executor_reregistration_timeout);
   Clock::resume();
 
   // Scheduler should receive the TASK_FAILED update.

http://git-wip-us.apache.org/repos/asf/mesos/blob/044196ff/src/tests/slave_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 8f81f77..d124d59 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -5005,7 +5005,7 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(SlaveTest, HTTPSchedulerSlaveRestart)
   Clock::settle();
 
   // Ensure the slave considers itself recovered.
-  Clock::advance(slave::EXECUTOR_REREGISTER_TIMEOUT);
+  Clock::advance(flags.executor_reregistration_timeout);
 
   Clock::resume();