You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@aurora.apache.org by ke...@apache.org on 2014/10/08 20:57:51 UTC

git commit: Don't kill GC Executor after period of inactivity

Repository: incubator-aurora
Updated Branches:
  refs/heads/master 94eeceabd -> f2f0ed861


Don't kill GC Executor after period of inactivity

The GC executor is configured to exit after 15 minutes of
inactivity. This leads to a race where the Mesos slave receives a
launchTask message for a GC executor just as that executor has exited,
causing TASK_LOST noise. It also increases the risk that a slave
will lose its GC executor because the scheduler cannot find
a slot for it (since GC executors will have a higher churn rate).

Cluster operators will still be able to deploy new versions of the
GC executor because the 24-hour maximum lifetime limit is still in
place. This patch only removes the inactivity limit.

Testing Done:
./pants src/test/python/apache/aurora/executor:gc_executor

Bugs closed: AURORA-788

Reviewed at https://reviews.apache.org/r/26300/


Project: http://git-wip-us.apache.org/repos/asf/incubator-aurora/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-aurora/commit/f2f0ed86
Tree: http://git-wip-us.apache.org/repos/asf/incubator-aurora/tree/f2f0ed86
Diff: http://git-wip-us.apache.org/repos/asf/incubator-aurora/diff/f2f0ed86

Branch: refs/heads/master
Commit: f2f0ed8615def98e26fcfc3ce580a537204a2a9d
Parents: 94eecea
Author: Kevin Sweeney <ke...@apache.org>
Authored: Wed Oct 8 11:56:59 2014 -0700
Committer: Kevin Sweeney <ke...@apache.org>
Committed: Wed Oct 8 11:57:17 2014 -0700

----------------------------------------------------------------------
 .../apache/aurora/executor/gc_executor.py       |  5 ---
 .../apache/aurora/executor/test_gc_executor.py  | 33 +-------------------
 2 files changed, 1 insertion(+), 37 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/f2f0ed86/src/main/python/apache/aurora/executor/gc_executor.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/executor/gc_executor.py b/src/main/python/apache/aurora/executor/gc_executor.py
index 788671e..a11feb9 100644
--- a/src/main/python/apache/aurora/executor/gc_executor.py
+++ b/src/main/python/apache/aurora/executor/gc_executor.py
@@ -88,9 +88,6 @@ class ThermosGCExecutor(ExecutorBase, ExceptionalThread, Observable):
   # wait time between checking for new GC events from the slave and/or cleaning orphaned tasks
   POLL_WAIT = Amount(5, Time.MINUTES)
 
-  # maximum amount of time the executor will wait with no tasks before it exits.
-  MAXIMUM_EXECUTOR_WAIT = Amount(15, Time.MINUTES)
-
   # maximum lifetime of this executor.  this is to prevent older GC executor binaries from
   # running forever
   MAXIMUM_EXECUTOR_LIFETIME = Amount(1, Time.DAYS)
@@ -455,8 +452,6 @@ class ThermosGCExecutor(ExecutorBase, ExceptionalThread, Observable):
       now = self._clock.time()
       if now > run_start + self.MAXIMUM_EXECUTOR_LIFETIME.as_(Time.SECONDS):
         return True
-      if now > last_gc_run + self.MAXIMUM_EXECUTOR_WAIT.as_(Time.SECONDS):
-        return True
       return self._stop_event.is_set()
 
     while not should_terminate():

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/f2f0ed86/src/test/python/apache/aurora/executor/test_gc_executor.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/executor/test_gc_executor.py b/src/test/python/apache/aurora/executor/test_gc_executor.py
index 1905fe3..85afe50 100644
--- a/src/test/python/apache/aurora/executor/test_gc_executor.py
+++ b/src/test/python/apache/aurora/executor/test_gc_executor.py
@@ -500,11 +500,8 @@ def test_ignores_launch_task_when_shutting_down():
     assert (mesos_pb2.TASK_FAILED, TASK_ID) == proxy_driver.updates[-1]
 
 
-def make_gc_executor_with_timeouts(
-    maximum_executor_wait=Amount(15, Time.MINUTES),
-    maximum_executor_lifetime=Amount(1, Time.DAYS)):
+def make_gc_executor_with_timeouts(maximum_executor_lifetime=Amount(1, Time.DAYS)):
   class TimeoutGCExecutor(ThinTestThermosGCExecutor):
-    MAXIMUM_EXECUTOR_WAIT = maximum_executor_wait
     MAXIMUM_EXECUTOR_LIFETIME = maximum_executor_lifetime
   return TimeoutGCExecutor
 
@@ -520,34 +517,6 @@ def run_gc_with_timeout(**kw):
     yield (proxy_driver, executor)
 
 
-def test_gc_wait():
-  # run w/ no tasks
-  with run_gc_with_timeout(maximum_executor_wait=Amount(15, Time.SECONDS)) as (
-      proxy_driver, executor):
-    executor._clock.tick(10)
-    proxy_driver.stopped.wait(timeout=0.1)
-    assert not proxy_driver.stopped.is_set()
-    executor._clock.tick(5.1)
-    proxy_driver.stopped.wait(timeout=0.1)
-    assert proxy_driver.stopped.is_set()
-    assert not executor._stop_event.is_set()
-
-  # ensure launchTask restarts executor wait
-  with run_gc_with_timeout(maximum_executor_wait=Amount(15, Time.SECONDS)) as (
-      proxy_driver, executor):
-    executor._clock.tick(10)
-    proxy_driver.stopped.wait(timeout=0.1)
-    assert not proxy_driver.stopped.is_set()
-    executor.launchTask(proxy_driver, serialize_art(AdjustRetainedTasks(retainedTasks={})))
-    executor._clock.tick(5.1)
-    proxy_driver.stopped.wait(timeout=0.1)
-    assert not proxy_driver.stopped.is_set()
-    executor._clock.tick(15.1)
-    proxy_driver.stopped.wait(timeout=0.1)
-    assert proxy_driver.stopped.is_set()
-    assert not executor._stop_event.is_set()
-
-
 def test_gc_lifetime():
   with run_gc_with_timeout(maximum_executor_lifetime=Amount(500, Time.MILLISECONDS)) as (
       proxy_driver, executor):