You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by "Jarek Potiuk (Jira)" <ji...@apache.org> on 2020/01/06 23:08:00 UTC

[jira] [Resolved] (AIRFLOW-6360) config option to skip task_stats from getting completed dagruns/tis

     [ https://issues.apache.org/jira/browse/AIRFLOW-6360?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Jarek Potiuk resolved AIRFLOW-6360.
-----------------------------------
    Fix Version/s: 2.0.0
       Resolution: Fixed

> config option to skip task_stats from getting completed dagruns/tis
> -------------------------------------------------------------------
>
>                 Key: AIRFLOW-6360
>                 URL: https://issues.apache.org/jira/browse/AIRFLOW-6360
>             Project: Apache Airflow
>          Issue Type: Improvement
>          Components: ui
>    Affects Versions: 1.10.6
>            Reporter: t oo
>            Assignee: t oo
>            Priority: Major
>             Fix For: 2.0.0
>
>
> task_stats endpoint to display 'recent tasks' is very slow when someone has many dagruns or many tasks in a dag.
> BEFORE
>  LastDagRun = (
>             session.query(DagRun.dag_id, sqla.func.max(DagRun.execution_date).label('execution_date'))
>                 .join(Dag, Dag.dag_id == DagRun.dag_id)
>                 .filter(DagRun.state != State.RUNNING)
>                 .filter(Dag.is_active == True)  # noqa: E712
>                 .filter(Dag.is_subdag == False)  # noqa: E712
>                 .group_by(DagRun.dag_id)
>                 .subquery('last_dag_run')
>         )
>         RunningDagRun = (
>             session.query(DagRun.dag_id, DagRun.execution_date)
>                 .join(Dag, Dag.dag_id == DagRun.dag_id)
>                 .filter(DagRun.state == State.RUNNING)
>                 .filter(Dag.is_active == True)  # noqa: E712
>                 .filter(Dag.is_subdag == False)  # noqa: E712
>                 .subquery('running_dag_run')
>         )
>         # Select all task_instances from active dag_runs.
>         # If no dag_run is active, return task instances from most recent dag_run.
>         LastTI = (
>             session.query(TI.dag_id.label('dag_id'), TI.state.label('state'))
>             .join(LastDagRun, and_(
>                 LastDagRun.c.dag_id == TI.dag_id,
>                 LastDagRun.c.execution_date == TI.execution_date))
>         )
>         RunningTI = (
>             session.query(TI.dag_id.label('dag_id'), TI.state.label('state'))
>             .join(RunningDagRun, and_(
>                 RunningDagRun.c.dag_id == TI.dag_id,
>                 RunningDagRun.c.execution_date == TI.execution_date))
>         )
>         UnionTI = union_all(LastTI, RunningTI).alias('union_ti')
>         qry = (
>             session.query(UnionTI.c.dag_id, UnionTI.c.state, sqla.func.count())
>             .group_by(UnionTI.c.dag_id, UnionTI.c.state)
>         )
> AFTER
> #we not interested in stats for dagruns already completed, only want active ones
>         RunningDagRun = (
>             session.query(DagRun.dag_id, DagRun.execution_date)
>                 .join(Dag, Dag.dag_id == DagRun.dag_id)
>                 .filter(DagRun.state == State.RUNNING,
>                 Dag.is_active,
>                 Dag.is_subdag == False)  # noqa: E712
>                 .subquery('running_dag_run')
>         )
>         # Select all task_instances from active dag_runs.
>         qry = (
>             session.query(TI.dag_id.label('dag_id'), TI.state.label('state'), sqla.func.count())
>             .join(RunningDagRun, and_(
>                 RunningDagRun.c.dag_id == TI.dag_id,
>                 RunningDagRun.c.execution_date == TI.execution_date))
>             .group_by(TI.dag_id, TI.state)
>         )



--
This message was sent by Atlassian Jira
(v8.3.4#803005)