You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by "Teresa Martyny (JIRA)" <ji...@apache.org> on 2019/04/30 23:41:00 UTC

[jira] [Commented] (AIRFLOW-4134) "DB connection invalidated" warning raised every JOB_HEARTBEAT_SEC

    [ https://issues.apache.org/jira/browse/AIRFLOW-4134?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16830759#comment-16830759 ] 

Teresa Martyny commented on AIRFLOW-4134:
-----------------------------------------

This is still happening in 1.10.3 as well.

> "DB connection invalidated" warning raised every JOB_HEARTBEAT_SEC
> ------------------------------------------------------------------
>
>                 Key: AIRFLOW-4134
>                 URL: https://issues.apache.org/jira/browse/AIRFLOW-4134
>             Project: Apache Airflow
>          Issue Type: Bug
>    Affects Versions: 1.10.2
>            Reporter: Daniel Standish
>            Priority: Major
>
> I am finding with 1.10.2 that I seem to get a warning {{DB connection invalidated. Reconnecting...}} very frequently.  It seems to coincide closely with my job_heartbeat_sec parameter (5 seconds).
> I have tried to diagnose I added logging of the triggering error on line 79 in airflow/utils/sqlalchemy.py, from which this warning is generated.
> Looks like it is related to zombie check.
> Call stack:
> {code}
> Call stack:
>   File "/usr/local/bin/airflow", line 32, in <module>
>     args.func(args)
>   File "/usr/local/lib/python3.6/site-packages/airflow/utils/cli.py", line 74, in wrapper
>     return f(*args, **kwargs)
>   File "/usr/local/lib/python3.6/site-packages/airflow/bin/cli.py", line 992, in scheduler
>     job.run()
>   File "/usr/local/lib/python3.6/site-packages/airflow/jobs.py", line 205, in run
>     self._execute()
>   File "/usr/local/lib/python3.6/site-packages/airflow/jobs.py", line 1532, in _execute
>     self._execute_helper()
>   File "/usr/local/lib/python3.6/site-packages/airflow/jobs.py", line 1562, in _execute_helper
>     self.processor_agent.start()
>   File "/usr/local/lib/python3.6/site-packages/airflow/utils/dag_processing.py", line 511, in start
>     self._async_mode)
>   File "/usr/local/lib/python3.6/site-packages/airflow/utils/dag_processing.py", line 565, in _launch_process
>     p.start()
>   File "/usr/local/lib/python3.6/multiprocessing/process.py", line 105, in start
>     self._popen = self._Popen(self)
>   File "/usr/local/lib/python3.6/multiprocessing/context.py", line 223, in _Popen
>     return _default_context.get_context().Process._Popen(process_obj)
>   File "/usr/local/lib/python3.6/multiprocessing/context.py", line 277, in _Popen
>     return Popen(process_obj)
>   File "/usr/local/lib/python3.6/multiprocessing/popen_fork.py", line 19, in __init__
>     self._launch(process_obj)
>   File "/usr/local/lib/python3.6/multiprocessing/popen_fork.py", line 73, in _launch
>     code = process_obj._bootstrap()
>   File "/usr/local/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
>     self.run()
>   File "/usr/local/lib/python3.6/multiprocessing/process.py", line 93, in run
>     self._target(*self._args, **self._kwargs)
>   File "/usr/local/lib/python3.6/site-packages/airflow/utils/dag_processing.py", line 560, in helper
>     processor_manager.start()
>   File "/usr/local/lib/python3.6/site-packages/airflow/utils/dag_processing.py", line 797, in start
>     self.start_in_async()
>   File "/usr/local/lib/python3.6/site-packages/airflow/utils/dag_processing.py", line 820, in start_in_async
>     simple_dags = self.heartbeat()
>   File "/usr/local/lib/python3.6/site-packages/airflow/utils/dag_processing.py", line 1190, in heartbeat
>     zombies = self._find_zombies()
>   File "/usr/local/lib/python3.6/site-packages/airflow/utils/db.py", line 73, in wrapper
>     return func(*args, **kwargs)
>   File "/usr/local/lib/python3.6/site-packages/airflow/utils/dag_processing.py", line 1236, in _find_zombies
>     LJ.latest_heartbeat < limit_dttm,
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/orm/query.py", line 2925, in all
>     return list(self)
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/orm/query.py", line 3081, in __iter__
>     return self._execute_and_instances(context)
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/orm/query.py", line 3103, in _execute_and_instances
>     querycontext, self._connection_from_session, close_with_result=True
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/orm/query.py", line 3111, in _get_bind_args
>     mapper=self._bind_mapper(), clause=querycontext.statement, **kw
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/orm/query.py", line 3096, in _connection_from_session
>     conn = self.session.connection(**kw)
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/orm/session.py", line 1120, in connection
>     execution_options=execution_options,
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/orm/session.py", line 1126, in _connection_for_bind
>     engine, execution_options
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/orm/session.py", line 424, in _connection_for_bind
>     conn = bind.contextual_connect()
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/base.py", line 2194, in contextual_connect
>     **kwargs
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/base.py", line 125, in __init__
>     self.dispatch.engine_connect(self, self.__branch)
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/event/attr.py", line 297, in __call__
>     fn(*args, **kw)
>   File "/usr/local/lib/python3.6/site-packages/airflow/utils/sqlalchemy.py", line 79, in ping_connection
>     log.warning("DB connection invalidated. Reconnecting...", err)
> Message: 'DB connection invalidated. Reconnecting...'
> Arguments: (OperationalError('(psycopg2.OperationalError) server closed the connection unexpectedly\n\tThis probably means the server terminated abnormally\n\tbefore or while processing the request.\n',),)
> {code}
> Traceback:
> {code}
> Traceback (most recent call last):
>   File "/usr/local/lib/python3.6/site-packages/airflow/utils/sqlalchemy.py", line 68, in ping_connection
>     connection.scalar(select([1]))
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/base.py", line 912, in scalar
>     return self.execute(object_, *multiparams, **params).scalar()
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/base.py", line 980, in execute
>     return meth(self, multiparams, params)
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/sql/elements.py", line 273, in _execute_on_connection
>     return connection._execute_clauseelement(self, multiparams, params)
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/base.py", line 1099, in _execute_clauseelement
>     distilled_params,
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/base.py", line 1240, in _execute_context
>     e, statement, parameters, cursor, context
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/base.py", line 1458, in _handle_dbapi_exception
>     util.raise_from_cause(sqlalchemy_exception, exc_info)
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/util/compat.py", line 296, in raise_from_cause
>     reraise(type(exception), exception, tb=exc_tb, cause=cause)
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/util/compat.py", line 276, in reraise
>     raise value.with_traceback(tb)
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/base.py", line 1236, in _execute_context
>     cursor, statement, parameters, context
>   File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/default.py", line 536, in do_execute
>     cursor.execute(statement, parameters)
> sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) server closed the connection unexpectedly
> 	This probably means the server terminated abnormally
> 	before or while processing the request.
>  [SQL: 'SELECT 1'] (Background on this error at: http://sqlalche.me/e/e3q8)
> {code}
> If I disable connection pooling, then the warning seems to go away.
> Beyond that, I am not sure where to go from here.  But something must be wrong.  
> Currently running with a variation of puckel/docker, but seems to happen on every setup with 1.10.2.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)