You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@aurora.apache.org by "Stephan Erb (JIRA)" <ji...@apache.org> on 2017/10/31 11:16:00 UTC

[jira] [Updated] (AURORA-1955) thermos should exit on irrecoverable errors to avoid zombies

     [ https://issues.apache.org/jira/browse/AURORA-1955?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Stephan Erb updated AURORA-1955:
--------------------------------
    Description: 
We found several zombie executors on a cluster. The Thermos logs indicate that system limits were reached, apparently while trying to shut down. The Mesos agent is unable to get the status of this container from the Docker daemon (docker inspect fails). Shouldn't Thermos exit in such a case?

{code}
 22 WARNING: Your kernel does not support swap limit capabilities, memory limited without swap.
 23 twitter.common.app debug: Initializing: twitter.common.log (Logging subsystem.)
 24 Writing log files to disk in /mnt/mesos/sandbox
 25 I1023 19:04:32.261165     7 exec.cpp:162] Version: 1.2.0
 26 I1023 19:04:32.264870    42 exec.cpp:237] Executor registered on agent b4fff262-c925-4edf-a2ef-2a5bbe89c42b-S3295
 27 Writing log files to disk in /mnt/mesos/sandbox
 28 Traceback (most recent call last):
 29   File "/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py", line 1    26, in _excepting_run
 30     self.__real_run(*args, **kw)
 31   File "apache/thermos/monitoring/resource.py", line 243, in run
 32   File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/event_muxer.py", lin    e 79, in wait
 33     thread.start()
 34   File "/usr/lib/python2.7/threading.py", line 745, in start
 35     _start_new_thread(self.__bootstrap, ())
 36 thread.error: can't start new thread
 37 ERROR] Failed to stop health checkers:
 38 ERROR] Traceback (most recent call last):
 39   File "apache/aurora/executor/aurora_executor.py", line 209, in _shutdown
 40     propagate_deadline(self._chained_checker.stop, timeout=self.STOP_TIMEOUT)
 41   File "apache/aurora/executor/aurora_executor.py", line 35, in propagate_deadline
 42     return deadline(*args, daemon=True, propagate=True, **kw)
 43   File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deadline.py", line 6    1, in deadline
 44     AnonymousThread().start()
 45   File "/usr/lib/python2.7/threading.py", line 745, in start
 46     _start_new_thread(self.__bootstrap, ())
 47 error: can't start new thread
48
 49 ERROR] Failed to stop runner:
50 ERROR] Traceback (most recent call last):
 51   File "apache/aurora/executor/aurora_executor.py", line 217, in _shutdown
 52     propagate_deadline(self._runner.stop, timeout=self.STOP_TIMEOUT)
 53   File "apache/aurora/executor/aurora_executor.py", line 35, in propagate_deadline
 54     return deadline(*args, daemon=True, propagate=True, **kw)
 55   File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deadline.py", line 6    1, in deadline
 56     AnonymousThread().start()
 57   File "/usr/lib/python2.7/threading.py", line 745, in start
 58     _start_new_thread(self.__bootstrap, ())
 59 error: can't start new thread
 60
 61 Traceback (most recent call last):
 62   File "/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py", line 1    26, in _excepting_run
 63     self.__real_run(*args, **kw)
 64   File "apache/aurora/executor/status_manager.py", line 62, in run
 65   File "apache/aurora/executor/aurora_executor.py", line 235, in _shutdown
 66   File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deferred.py", line 5    6, in defer
 67     deferred.start()
 68   File "/usr/lib/python2.7/threading.py", line 745, in start
 69     _start_new_thread(self.__bootstrap, ())
 70 thread.error: can't start new thread
{code}

  was:
We found several zombie executors on a cluster. The Thermos logs indicate that system limits were reached, apparently while trying to shut down. The Mesos agent is unable to get the status of this container from the Docker daemon (docker inspect fails). Shouldn't Thermos exit in such a case?


 22 WARNING: Your kernel does not support swap limit capabilities, memory limited without swap.
 23 twitter.common.app debug: Initializing: twitter.common.log (Logging subsystem.)
 24 Writing log files to disk in /mnt/mesos/sandbox
 25 I1023 19:04:32.261165     7 exec.cpp:162] Version: 1.2.0
 26 I1023 19:04:32.264870    42 exec.cpp:237] Executor registered on agent b4fff262-c925-4edf-a2ef-2a5bbe89c42b-S3295
 27 Writing log files to disk in /mnt/mesos/sandbox
 28 Traceback (most recent call last):
 29   File "/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py", line 1    26, in _excepting_run
 30     self.__real_run(*args, **kw)
 31   File "apache/thermos/monitoring/resource.py", line 243, in run
 32   File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/event_muxer.py", lin    e 79, in wait
 33     thread.start()
 34   File "/usr/lib/python2.7/threading.py", line 745, in start
 35     _start_new_thread(self.__bootstrap, ())
 36 thread.error: can't start new thread
 37 ERROR] Failed to stop health checkers:
 38 ERROR] Traceback (most recent call last):
 39   File "apache/aurora/executor/aurora_executor.py", line 209, in _shutdown
 40     propagate_deadline(self._chained_checker.stop, timeout=self.STOP_TIMEOUT)
 41   File "apache/aurora/executor/aurora_executor.py", line 35, in propagate_deadline
 42     return deadline(*args, daemon=True, propagate=True, **kw)
 43   File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deadline.py", line 6    1, in deadline
 44     AnonymousThread().start()
 45   File "/usr/lib/python2.7/threading.py", line 745, in start
 46     _start_new_thread(self.__bootstrap, ())
 47 error: can't start new thread
48
 49 ERROR] Failed to stop runner:
50 ERROR] Traceback (most recent call last):
 51   File "apache/aurora/executor/aurora_executor.py", line 217, in _shutdown
 52     propagate_deadline(self._runner.stop, timeout=self.STOP_TIMEOUT)
 53   File "apache/aurora/executor/aurora_executor.py", line 35, in propagate_deadline
 54     return deadline(*args, daemon=True, propagate=True, **kw)
 55   File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deadline.py", line 6    1, in deadline
 56     AnonymousThread().start()
 57   File "/usr/lib/python2.7/threading.py", line 745, in start
 58     _start_new_thread(self.__bootstrap, ())
 59 error: can't start new thread
 60
 61 Traceback (most recent call last):
 62   File "/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py", line 1    26, in _excepting_run
 63     self.__real_run(*args, **kw)
 64   File "apache/aurora/executor/status_manager.py", line 62, in run
 65   File "apache/aurora/executor/aurora_executor.py", line 235, in _shutdown
 66   File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deferred.py", line 5    6, in defer
 67     deferred.start()
 68   File "/usr/lib/python2.7/threading.py", line 745, in start
 69     _start_new_thread(self.__bootstrap, ())
 70 thread.error: can't start new thread


> thermos should exit on irrecoverable errors to avoid zombies
> ------------------------------------------------------------
>
>                 Key: AURORA-1955
>                 URL: https://issues.apache.org/jira/browse/AURORA-1955
>             Project: Aurora
>          Issue Type: Bug
>          Components: Thermos
>            Reporter: Mohit Jaggi
>            Assignee: Stephan Erb
>             Fix For: 0.18.1
>
>
> We found several zombie executors on a cluster. The Thermos logs indicate that system limits were reached, apparently while trying to shut down. The Mesos agent is unable to get the status of this container from the Docker daemon (docker inspect fails). Shouldn't Thermos exit in such a case?
> {code}
>  22 WARNING: Your kernel does not support swap limit capabilities, memory limited without swap.
>  23 twitter.common.app debug: Initializing: twitter.common.log (Logging subsystem.)
>  24 Writing log files to disk in /mnt/mesos/sandbox
>  25 I1023 19:04:32.261165     7 exec.cpp:162] Version: 1.2.0
>  26 I1023 19:04:32.264870    42 exec.cpp:237] Executor registered on agent b4fff262-c925-4edf-a2ef-2a5bbe89c42b-S3295
>  27 Writing log files to disk in /mnt/mesos/sandbox
>  28 Traceback (most recent call last):
>  29   File "/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py", line 1    26, in _excepting_run
>  30     self.__real_run(*args, **kw)
>  31   File "apache/thermos/monitoring/resource.py", line 243, in run
>  32   File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/event_muxer.py", lin    e 79, in wait
>  33     thread.start()
>  34   File "/usr/lib/python2.7/threading.py", line 745, in start
>  35     _start_new_thread(self.__bootstrap, ())
>  36 thread.error: can't start new thread
>  37 ERROR] Failed to stop health checkers:
>  38 ERROR] Traceback (most recent call last):
>  39   File "apache/aurora/executor/aurora_executor.py", line 209, in _shutdown
>  40     propagate_deadline(self._chained_checker.stop, timeout=self.STOP_TIMEOUT)
>  41   File "apache/aurora/executor/aurora_executor.py", line 35, in propagate_deadline
>  42     return deadline(*args, daemon=True, propagate=True, **kw)
>  43   File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deadline.py", line 6    1, in deadline
>  44     AnonymousThread().start()
>  45   File "/usr/lib/python2.7/threading.py", line 745, in start
>  46     _start_new_thread(self.__bootstrap, ())
>  47 error: can't start new thread
> 48
>  49 ERROR] Failed to stop runner:
> 50 ERROR] Traceback (most recent call last):
>  51   File "apache/aurora/executor/aurora_executor.py", line 217, in _shutdown
>  52     propagate_deadline(self._runner.stop, timeout=self.STOP_TIMEOUT)
>  53   File "apache/aurora/executor/aurora_executor.py", line 35, in propagate_deadline
>  54     return deadline(*args, daemon=True, propagate=True, **kw)
>  55   File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deadline.py", line 6    1, in deadline
>  56     AnonymousThread().start()
>  57   File "/usr/lib/python2.7/threading.py", line 745, in start
>  58     _start_new_thread(self.__bootstrap, ())
>  59 error: can't start new thread
>  60
>  61 Traceback (most recent call last):
>  62   File "/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py", line 1    26, in _excepting_run
>  63     self.__real_run(*args, **kw)
>  64   File "apache/aurora/executor/status_manager.py", line 62, in run
>  65   File "apache/aurora/executor/aurora_executor.py", line 235, in _shutdown
>  66   File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deferred.py", line 5    6, in defer
>  67     deferred.start()
>  68   File "/usr/lib/python2.7/threading.py", line 745, in start
>  69     _start_new_thread(self.__bootstrap, ())
>  70 thread.error: can't start new thread
> {code}



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)