You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@aurora.apache.org by "Stephan Erb (JIRA)" <ji...@apache.org> on 2017/10/31 11:16:00 UTC
[jira] [Updated] (AURORA-1955) thermos should exit on irrecoverable
errors to avoid zombies
[ https://issues.apache.org/jira/browse/AURORA-1955?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Stephan Erb updated AURORA-1955:
--------------------------------
Description:
We found several zombie executors on a cluster. The Thermos logs indicate that system limits were reached while trying to shut down(?). The Mesos agent is unable to get the status of this container from the Docker daemon (docker inspect fails). Shouldn't thermos exit in such a case?
{code}
22 WARNING: Your kernel does not support swap limit capabilities, memory limited without swap.
23 twitter.common.app debug: Initializing: twitter.common.log (Logging subsystem.)
24 Writing log files to disk in /mnt/mesos/sandbox
25 I1023 19:04:32.261165 7 exec.cpp:162] Version: 1.2.0
26 I1023 19:04:32.264870 42 exec.cpp:237] Executor registered on agent b4fff262-c925-4edf-a2ef-2a5bbe89c42b-S3295
27 Writing log files to disk in /mnt/mesos/sandbox
28 Traceback (most recent call last):
29 File "/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py", line 126, in _excepting_run
30 self.__real_run(*args, **kw)
31 File "apache/thermos/monitoring/resource.py", line 243, in run
32 File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/event_muxer.py", line 79, in wait
33 thread.start()
34 File "/usr/lib/python2.7/threading.py", line 745, in start
35 _start_new_thread(self.__bootstrap, ())
36 thread.error: can't start new thread
37 ERROR] Failed to stop health checkers:
38 ERROR] Traceback (most recent call last):
39 File "apache/aurora/executor/aurora_executor.py", line 209, in _shutdown
40 propagate_deadline(self._chained_checker.stop, timeout=self.STOP_TIMEOUT)
41 File "apache/aurora/executor/aurora_executor.py", line 35, in propagate_deadline
42 return deadline(*args, daemon=True, propagate=True, **kw)
43 File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deadline.py", line 61, in deadline
44 AnonymousThread().start()
45 File "/usr/lib/python2.7/threading.py", line 745, in start
46 _start_new_thread(self.__bootstrap, ())
47 error: can't start new thread
48
49 ERROR] Failed to stop runner:
50 ERROR] Traceback (most recent call last):
51 File "apache/aurora/executor/aurora_executor.py", line 217, in _shutdown
52 propagate_deadline(self._runner.stop, timeout=self.STOP_TIMEOUT)
53 File "apache/aurora/executor/aurora_executor.py", line 35, in propagate_deadline
54 return deadline(*args, daemon=True, propagate=True, **kw)
55 File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deadline.py", line 61, in deadline
56 AnonymousThread().start()
57 File "/usr/lib/python2.7/threading.py", line 745, in start
58 _start_new_thread(self.__bootstrap, ())
59 error: can't start new thread
60
61 Traceback (most recent call last):
62 File "/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py", line 126, in _excepting_run
63 self.__real_run(*args, **kw)
64 File "apache/aurora/executor/status_manager.py", line 62, in run
65 File "apache/aurora/executor/aurora_executor.py", line 235, in _shutdown
66 File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deferred.py", line 56, in defer
67 deferred.start()
68 File "/usr/lib/python2.7/threading.py", line 745, in start
69 _start_new_thread(self.__bootstrap, ())
70 thread.error: can't start new thread
{code}
was:
We found several zombie executors on a cluster. The Thermos logs indicate that system limits were reached while trying to shut down(?). The Mesos agent is unable to get the status of this container from the Docker daemon (docker inspect fails). Shouldn't thermos exit in such a case?
22 WARNING: Your kernel does not support swap limit capabilities, memory limited without swap.
23 twitter.common.app debug: Initializing: twitter.common.log (Logging subsystem.)
24 Writing log files to disk in /mnt/mesos/sandbox
25 I1023 19:04:32.261165 7 exec.cpp:162] Version: 1.2.0
26 I1023 19:04:32.264870 42 exec.cpp:237] Executor registered on agent b4fff262-c925-4edf-a2ef-2a5bbe89c42b-S3295
27 Writing log files to disk in /mnt/mesos/sandbox
28 Traceback (most recent call last):
29 File "/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py", line 126, in _excepting_run
30 self.__real_run(*args, **kw)
31 File "apache/thermos/monitoring/resource.py", line 243, in run
32 File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/event_muxer.py", line 79, in wait
33 thread.start()
34 File "/usr/lib/python2.7/threading.py", line 745, in start
35 _start_new_thread(self.__bootstrap, ())
36 thread.error: can't start new thread
37 ERROR] Failed to stop health checkers:
38 ERROR] Traceback (most recent call last):
39 File "apache/aurora/executor/aurora_executor.py", line 209, in _shutdown
40 propagate_deadline(self._chained_checker.stop, timeout=self.STOP_TIMEOUT)
41 File "apache/aurora/executor/aurora_executor.py", line 35, in propagate_deadline
42 return deadline(*args, daemon=True, propagate=True, **kw)
43 File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deadline.py", line 61, in deadline
44 AnonymousThread().start()
45 File "/usr/lib/python2.7/threading.py", line 745, in start
46 _start_new_thread(self.__bootstrap, ())
47 error: can't start new thread
48
49 ERROR] Failed to stop runner:
50 ERROR] Traceback (most recent call last):
51 File "apache/aurora/executor/aurora_executor.py", line 217, in _shutdown
52 propagate_deadline(self._runner.stop, timeout=self.STOP_TIMEOUT)
53 File "apache/aurora/executor/aurora_executor.py", line 35, in propagate_deadline
54 return deadline(*args, daemon=True, propagate=True, **kw)
55 File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deadline.py", line 61, in deadline
56 AnonymousThread().start()
57 File "/usr/lib/python2.7/threading.py", line 745, in start
58 _start_new_thread(self.__bootstrap, ())
59 error: can't start new thread
60
61 Traceback (most recent call last):
62 File "/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py", line 126, in _excepting_run
63 self.__real_run(*args, **kw)
64 File "apache/aurora/executor/status_manager.py", line 62, in run
65 File "apache/aurora/executor/aurora_executor.py", line 235, in _shutdown
66 File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deferred.py", line 56, in defer
67 deferred.start()
68 File "/usr/lib/python2.7/threading.py", line 745, in start
69 _start_new_thread(self.__bootstrap, ())
70 thread.error: can't start new thread
> thermos should exit on irrecoverable errors to avoid zombies
> ------------------------------------------------------------
>
> Key: AURORA-1955
> URL: https://issues.apache.org/jira/browse/AURORA-1955
> Project: Aurora
> Issue Type: Bug
> Components: Thermos
> Reporter: Mohit Jaggi
> Assignee: Stephan Erb
> Fix For: 0.18.1
>
>
> We found several zombie executors on a cluster. Thermos logs indicate reaching system limits while trying to shutdown(?). Mesos agent is unable to get status of this container from docker daemon (docker inspect fails). Shouldn't thermos exit in such a case?
> {code}
> 22 WARNING: Your kernel does not support swap limit capabilities, memory limited without swap.
> 23 twitter.common.app debug: Initializing: twitter.common.log (Logging subsystem.)
> 24 Writing log files to disk in /mnt/mesos/sandbox
> 25 I1023 19:04:32.261165 7 exec.cpp:162] Version: 1.2.0
> 26 I1023 19:04:32.264870 42 exec.cpp:237] Executor registered on agent b4fff262-c925-4edf-a2ef-2a5bbe89c42b-S3295
> 27 Writing log files to disk in /mnt/mesos/sandbox
> 28 Traceback (most recent call last):
> 29 File "/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py", line 1 26, in _excepting_run
> 30 self.__real_run(*args, **kw)
> 31 File "apache/thermos/monitoring/resource.py", line 243, in run
> 32 File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/event_muxer.py", lin e 79, in wait
> 33 thread.start()
> 34 File "/usr/lib/python2.7/threading.py", line 745, in start
> 35 _start_new_thread(self.__bootstrap, ())
> 36 thread.error: can't start new thread
> 37 ERROR] Failed to stop health checkers:
> 38 ERROR] Traceback (most recent call last):
> 39 File "apache/aurora/executor/aurora_executor.py", line 209, in _shutdown
> 40 propagate_deadline(self._chained_checker.stop, timeout=self.STOP_TIMEOUT)
> 41 File "apache/aurora/executor/aurora_executor.py", line 35, in propagate_deadline
> 42 return deadline(*args, daemon=True, propagate=True, **kw)
> 43 File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deadline.py", line 6 1, in deadline
> 44 AnonymousThread().start()
> 45 File "/usr/lib/python2.7/threading.py", line 745, in start
> 46 _start_new_thread(self.__bootstrap, ())
> 47 error: can't start new thread
> 48
> 49 ERROR] Failed to stop runner:
> 50 ERROR] Traceback (most recent call last):
> 51 File "apache/aurora/executor/aurora_executor.py", line 217, in _shutdown
> 52 propagate_deadline(self._runner.stop, timeout=self.STOP_TIMEOUT)
> 53 File "apache/aurora/executor/aurora_executor.py", line 35, in propagate_deadline
> 54 return deadline(*args, daemon=True, propagate=True, **kw)
> 55 File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deadline.py", line 6 1, in deadline
> 56 AnonymousThread().start()
> 57 File "/usr/lib/python2.7/threading.py", line 745, in start
> 58 _start_new_thread(self.__bootstrap, ())
> 59 error: can't start new thread
> 60
> 61 Traceback (most recent call last):
> 62 File "/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py", line 1 26, in _excepting_run
> 63 self.__real_run(*args, **kw)
> 64 File "apache/aurora/executor/status_manager.py", line 62, in run
> 65 File "apache/aurora/executor/aurora_executor.py", line 235, in _shutdown
> 66 File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deferred.py", line 5 6, in defer
> 67 deferred.start()
> 68 File "/usr/lib/python2.7/threading.py", line 745, in start
> 69 _start_new_thread(self.__bootstrap, ())
> 70 thread.error: can't start new thread
> {code}
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)