Posted to commits@airflow.apache.org by ma...@apache.org on 2016/06/05 05:23:55 UTC

[05/34] incubator-airflow-site git commit: Initial commit

http://git-wip-us.apache.org/repos/asf/incubator-airflow-site/blob/9e19165c/concepts.html
----------------------------------------------------------------------
diff --git a/concepts.html b/concepts.html
new file mode 100644
index 0000000..871c608
--- /dev/null
+++ b/concepts.html
@@ -0,0 +1,897 @@
+
+
+<!DOCTYPE html>
+<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
+<head>
+  <meta charset="utf-8">
+  
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  
+  <title>Concepts &mdash; Airflow Documentation</title>
+  
+
+  
+  
+
+  
+
+  
+  
+    
+
+  
+
+  
+  
+    <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
+  
+
+  
+
+  
+    <link rel="top" title="Airflow Documentation" href="index.html"/>
+        <link rel="next" title="Data Profiling" href="profiling.html"/>
+        <link rel="prev" title="UI / Screenshots" href="ui.html"/> 
+
+  
+  <script src="_static/js/modernizr.min.js"></script>
+
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+
+    
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
+      <div class="wy-side-scroll">
+        <div class="wy-side-nav-search">
+          
+
+          
+            <a href="index.html" class="icon icon-home"> Airflow
+          
+
+          
+          </a>
+
+          
+            
+            
+          
+
+          
+<div role="search">
+  <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
+    <input type="text" name="q" placeholder="Search docs" />
+    <input type="hidden" name="check_keywords" value="yes" />
+    <input type="hidden" name="area" value="default" />
+  </form>
+</div>
+
+          
+        </div>
+
+        <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
+          
+            
+            
+                <ul class="current">
+<li class="toctree-l1"><a class="reference internal" href="project.html">Project</a></li>
+<li class="toctree-l1"><a class="reference internal" href="license.html">License</a></li>
+<li class="toctree-l1"><a class="reference internal" href="start.html">Quick Start</a></li>
+<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
+<li class="toctree-l1"><a class="reference internal" href="tutorial.html">Tutorial</a></li>
+<li class="toctree-l1"><a class="reference internal" href="configuration.html">Configuration</a></li>
+<li class="toctree-l1"><a class="reference internal" href="ui.html">UI / Screenshots</a></li>
+<li class="toctree-l1 current"><a class="current reference internal" href="#">Concepts</a><ul>
+<li class="toctree-l2"><a class="reference internal" href="#core-ideas">Core Ideas</a><ul>
+<li class="toctree-l3"><a class="reference internal" href="#dags">DAGs</a><ul>
+<li class="toctree-l4"><a class="reference internal" href="#scope">Scope</a></li>
+<li class="toctree-l4"><a class="reference internal" href="#default-arguments">Default Arguments</a></li>
+<li class="toctree-l4"><a class="reference internal" href="#context-manager">Context Manager</a></li>
+</ul>
+</li>
+<li class="toctree-l3"><a class="reference internal" href="#operators">Operators</a><ul>
+<li class="toctree-l4"><a class="reference internal" href="#dag-assignment">DAG Assignment</a></li>
+<li class="toctree-l4"><a class="reference internal" href="#bitshift-composition">Bitshift Composition</a></li>
+</ul>
+</li>
+<li class="toctree-l3"><a class="reference internal" href="#tasks">Tasks</a></li>
+<li class="toctree-l3"><a class="reference internal" href="#task-instances">Task Instances</a></li>
+<li class="toctree-l3"><a class="reference internal" href="#workflows">Workflows</a></li>
+</ul>
+</li>
+<li class="toctree-l2"><a class="reference internal" href="#additional-functionality">Additional Functionality</a><ul>
+<li class="toctree-l3"><a class="reference internal" href="#hooks">Hooks</a></li>
+<li class="toctree-l3"><a class="reference internal" href="#pools">Pools</a></li>
+<li class="toctree-l3"><a class="reference internal" href="#connections">Connections</a></li>
+<li class="toctree-l3"><a class="reference internal" href="#queues">Queues</a></li>
+<li class="toctree-l3"><a class="reference internal" href="#xcoms">XComs</a></li>
+<li class="toctree-l3"><a class="reference internal" href="#variables">Variables</a></li>
+<li class="toctree-l3"><a class="reference internal" href="#branching">Branching</a></li>
+<li class="toctree-l3"><a class="reference internal" href="#subdags">SubDAGs</a></li>
+<li class="toctree-l3"><a class="reference internal" href="#slas">SLAs</a></li>
+<li class="toctree-l3"><a class="reference internal" href="#trigger-rules">Trigger Rules</a></li>
+<li class="toctree-l3"><a class="reference internal" href="#zombies-undeads">Zombies &amp; Undeads</a></li>
+<li class="toctree-l3"><a class="reference internal" href="#cluster-policy">Cluster Policy</a></li>
+<li class="toctree-l3"><a class="reference internal" href="#documentation-notes">Documentation &amp; Notes</a></li>
+<li class="toctree-l3"><a class="reference internal" href="#jinja-templating">Jinja Templating</a></li>
+</ul>
+</li>
+<li class="toctree-l2"><a class="reference internal" href="#packaged-dags">Packaged dags</a></li>
+</ul>
+</li>
+<li class="toctree-l1"><a class="reference internal" href="profiling.html">Data Profiling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="cli.html">Command Line Interface</a></li>
+<li class="toctree-l1"><a class="reference internal" href="scheduler.html">Scheduling &amp; Triggers</a></li>
+<li class="toctree-l1"><a class="reference internal" href="plugins.html">Plugins</a></li>
+<li class="toctree-l1"><a class="reference internal" href="security.html">Security</a></li>
+<li class="toctree-l1"><a class="reference internal" href="faq.html">FAQ</a></li>
+<li class="toctree-l1"><a class="reference internal" href="code.html">API Reference</a></li>
+</ul>
+
+            
+          
+        </div>
+      </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+
+      
+      <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
+        <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+        <a href="index.html">Airflow</a>
+      </nav>
+
+
+      
+      <div class="wy-nav-content">
+        <div class="rst-content">
+          
+
+ 
+
+
+
+<div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="index.html">Docs</a> &raquo;</li>
+      
+    <li>Concepts</li>
+      <li class="wy-breadcrumbs-aside">
+        
+          
+            <a href="_sources/concepts.txt" rel="nofollow"> View page source</a>
+          
+        
+      </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+           <div itemprop="articleBody">
+            
+  <div class="section" id="concepts">
+<h1>Concepts<a class="headerlink" href="#concepts" title="Permalink to this headline">¶</a></h1>
+<p>The Airflow Platform is a tool for describing, executing, and monitoring
+workflows.</p>
+<div class="section" id="core-ideas">
+<h2>Core Ideas<a class="headerlink" href="#core-ideas" title="Permalink to this headline">¶</a></h2>
+<div class="section" id="dags">
+<h3>DAGs<a class="headerlink" href="#dags" title="Permalink to this headline">¶</a></h3>
+<p>In Airflow, a <code class="docutils literal"><span class="pre">DAG</span></code> &#8211; or a Directed Acyclic Graph &#8211; is a collection of all
+the tasks you want to run, organized in a way that reflects their relationships
+and dependencies.</p>
+<p>For example, a simple DAG could consist of three tasks: A, B, and C. It could
+say that A has to run successfully before B can run, but C can run anytime. It
+could say that task A times out after 5 minutes, and B can be restarted up to 5
+times in case it fails. It might also say that the workflow will run every night
+at 10pm, but shouldn&#8217;t start until a certain date.</p>
+<p>In this way, a DAG describes <em>how</em> you want to carry out your workflow; but
+notice that we haven&#8217;t said anything about <em>what</em> we actually want to do! A, B,
+and C could be anything. Maybe A prepares data for B to analyze while C sends an
+email. Or perhaps A monitors your location so B can open your garage door while
+C turns on your house lights. The important thing is that the DAG isn&#8217;t
+concerned with what its constituent tasks do; its job is to make sure that
+whatever they do happens at the right time, or in the right order, or with the
+right handling of any unexpected issues.</p>
+<p>DAGs are defined in standard Python files that are placed in Airflow&#8217;s
+<code class="docutils literal"><span class="pre">DAG_FOLDER</span></code>. Airflow will execute the code in each file to dynamically build
+the <code class="docutils literal"><span class="pre">DAG</span></code> objects. You can have as many DAGs as you want, each describing an
+arbitrary number of tasks. In general, each one should correspond to a single
+logical workflow.</p>
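+<p>For example, a minimal DAG file might look like the following sketch (names
+are illustrative, and the exact import paths may vary between Airflow
+versions):</p>
+<div class="code python highlight-default"><div class="highlight"><pre># a minimal DAG file, as it might live in DAG_FOLDER
+from datetime import datetime
+
+from airflow.models import DAG
+from airflow.operators import DummyOperator
+
+dag = DAG('my_simple_dag', start_date=datetime(2016, 1, 1))
+
+task_a = DummyOperator(task_id='task_a', dag=dag)
+task_b = DummyOperator(task_id='task_b', dag=dag)
+
+# task_a must run successfully before task_b
+task_a.set_downstream(task_b)
+</pre></div>
+</div>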
+<div class="section" id="scope">
+<h4>Scope<a class="headerlink" href="#scope" title="Permalink to this headline">¶</a></h4>
+<p>Airflow will load any <code class="docutils literal"><span class="pre">DAG</span></code> object it can import from a DAGfile. Critically,
+that means the DAG must appear in <code class="docutils literal"><span class="pre">globals()</span></code>. Consider the following two
+DAGs. Only <code class="docutils literal"><span class="pre">dag_1</span></code> will be loaded; the other one only appears in a local
+scope.</p>
+<div class="code python highlight-default"><div class="highlight"><pre><span></span><span class="n">dag_1</span> <span class="o">=</span> <span class="n">DAG</span><span class="p">(</span><span class="s1">&#39;this_dag_will_be_discovered&#39;</span><span class="p">)</span>
+
+<span class="k">def</span> <span class="nf">my_function</span><span class="p">()</span>
+    <span class="n">dag_2</span> <span class="o">=</span> <span class="n">DAG</span><span class="p">(</span><span class="s1">&#39;but_this_dag_will_not&#39;</span><span class="p">)</span>
+
+<span class="n">my_function</span><span class="p">()</span>
+</pre></div>
+</div>
+<p>Sometimes this can be put to good use. For example, a common pattern with
+<code class="docutils literal"><span class="pre">SubDagOperator</span></code> is to define the subdag inside a function so that Airflow
+doesn&#8217;t try to load it as a standalone DAG.</p>
+</div>
+<div class="section" id="default-arguments">
+<h4>Default Arguments<a class="headerlink" href="#default-arguments" title="Permalink to this headline">¶</a></h4>
+<p>If a dictionary of <code class="docutils literal"><span class="pre">default_args</span></code> is passed to a DAG, it will apply them to
+any of its operators. This makes it easy to apply a common parameter to many operators without having to type it many times.</p>
+<div class="code python highlight-default"><div class="highlight"><pre><span></span><span class="n">default_args</span><span class="o">=</span><span class="nb">dict</span><span class="p">(</span>
+    <span class="n">start_date</span><span class="o">=</span><span class="n">datetime</span><span class="p">(</span><span class="mi">2016</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span>
+    <span class="n">owner</span><span class="o">=</span><span class="s1">&#39;Airflow&#39;</span><span class="p">)</span>
+
+<span class="n">dag</span> <span class="o">=</span> <span class="n">DAG</span><span class="p">(</span><span class="s1">&#39;my_dag&#39;</span><span class="p">,</span> <span class="n">default_args</span><span class="o">=</span><span class="n">default_args</span><span class="p">)</span>
+<span class="n">op</span> <span class="o">=</span> <span class="n">DummyOperator</span><span class="p">(</span><span class="n">task_id</span><span class="o">=</span><span class="s1">&#39;dummy&#39;</span><span class="p">,</span> <span class="n">dag</span><span class="o">=</span><span class="n">dag</span><span class="p">)</span>
+<span class="nb">print</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">owner</span><span class="p">)</span> <span class="c1"># Airflow</span>
+</pre></div>
+</div>
+</div>
+<div class="section" id="context-manager">
+<h4>Context Manager<a class="headerlink" href="#context-manager" title="Permalink to this headline">¶</a></h4>
+<p><em>Added in Airflow 1.8</em></p>
+<p>DAGs can be used as context managers to automatically assign new operators to that DAG.</p>
+<div class="code python highlight-default"><div class="highlight"><pre><span></span><span class="k">with</span> <span class="n">DAG</span><span class="p">(</span><span class="s1">&#39;my_dag&#39;</span><span class="p">,</span> <span class="n">start_date</span><span class="o">=</span><span class="n">datetime</span><span class="p">(</span><span class="mi">2016</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span> <span class="k">as</span> <span class="n">dag</span><span class="p">:</span>
+    <span class="n">op</span> <span class="o">=</span> <span class="n">DummyOperator</span><span class="p">(</span><span class="s1">&#39;op&#39;</span><span class="p">)</span>
+
+<span class="n">op</span><span class="o">.</span><span class="n">dag</span> <span class="ow">is</span> <span class="n">dag</span> <span class="c1"># True</span>
+</pre></div>
+</div>
+</div>
+</div>
+<div class="section" id="operators">
+<h3>Operators<a class="headerlink" href="#operators" title="Permalink to this headline">¶</a></h3>
+<p>While DAGs describe <em>how</em> to run a workflow, <code class="docutils literal"><span class="pre">Operators</span></code> determine what
+actually gets done.</p>
+<p>An operator describes a single task in a workflow. Operators are usually (but
+not always) atomic, meaning they can stand on their own and don&#8217;t need to share
+resources with any other operators. The DAG will make sure that operators run in
+the correct order; other than those dependencies, operators generally
+run independently. In fact, they may run on two completely different machines.</p>
+<p>This is a subtle but very important point: in general, if two operators need to
+share information, like a filename or small amount of data, you should consider
+combining them into a single operator. If it absolutely can&#8217;t be avoided,
+Airflow does have a feature for operator cross-communication called XCom that is
+described elsewhere in this document.</p>
+<p>Airflow provides operators for many common tasks, including:</p>
+<ul class="simple">
+<li><code class="docutils literal"><span class="pre">BashOperator</span></code> - executes a bash command</li>
+<li><code class="docutils literal"><span class="pre">PythonOperator</span></code> - calls an arbitrary Python function</li>
+<li><code class="docutils literal"><span class="pre">EmailOperator</span></code> - sends an email</li>
+<li><code class="docutils literal"><span class="pre">HTTPOperator</span></code> - sends an HTTP request</li>
+<li><code class="docutils literal"><span class="pre">SqlOperator</span></code> - executes a SQL command</li>
+<li><code class="docutils literal"><span class="pre">Sensor</span></code> - waits for a certain time, file, database row, S3 key, etc...</li>
+</ul>
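+<p>As a quick sketch, instantiating two of these might look like the following
+(the script path and email address are placeholders, and a
+<code class="docutils literal"><span class="pre">dag</span></code> object is assumed to exist):</p>
+<div class="code python highlight-default"><div class="highlight"><pre>run_script = BashOperator(
+    task_id='run_script',
+    bash_command='bash /path/to/script.sh',
+    dag=dag)
+
+notify = EmailOperator(
+    task_id='notify',
+    to='someone@example.com',
+    subject='Job complete',
+    html_content='The script finished.',
+    dag=dag)
+</pre></div>
+</div>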
+<p>In addition to these basic building blocks, there are many more specific
+operators: <code class="docutils literal"><span class="pre">DockerOperator</span></code>, <code class="docutils literal"><span class="pre">HiveOperator</span></code>, <code class="docutils literal"><span class="pre">S3FileTransformOperator</span></code>,
+<code class="docutils literal"><span class="pre">PrestoToMysqlOperator</span></code>, <code class="docutils literal"><span class="pre">SlackOperator</span></code>... you get the idea!</p>
+<p>The <code class="docutils literal"><span class="pre">airflow/contrib/</span></code> directory contains yet more operators built by the
+community. These operators aren&#8217;t always as complete or well-tested as those in
+the main distribution, but allow users to more easily add new functionality to
+the platform.</p>
+<p>Operators are only loaded by Airflow if they are assigned to a DAG.</p>
+<div class="section" id="dag-assignment">
+<h4>DAG Assignment<a class="headerlink" href="#dag-assignment" title="Permalink to this headline">¶</a></h4>
+<p><em>Added in Airflow 1.8</em></p>
+<p>Operators do not have to be assigned to DAGs immediately (previously <code class="docutils literal"><span class="pre">dag</span></code> was
+a required argument). However, once an operator is assigned to a DAG, it cannot
+be transferred or unassigned. DAG assignment can be done explicitly when the
+operator is created, through deferred assignment, or even inferred from other
+operators.</p>
+<div class="code python highlight-default"><div class="highlight"><pre><span></span><span class="n">dag</span> <span class="o">=</span> <span class="n">DAG</span><span class="p">(</span><span class="s1">&#39;my_dag&#39;</span><span class="p">,</span> <span class="n">start_date</span><span class="o">=</span><span class="n">datetime</span><span class="p">(</span><span class="mi">2016</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span>
+
+<span class="c1"># sets the DAG explicitly</span>
+<span class="n">explicit_op</span> <span class="o">=</span> <span class="n">DummyOperator</span><span class="p">(</span><span class="n">task_id</span><span class="o">=</span><span class="s1">&#39;op1&#39;</span><span class="p">,</span> <span class="n">dag</span><span class="o">=</span><span class="n">dag</span><span class="p">)</span>
+
+<span class="c1"># deferred DAG assignment</span>
+<span class="n">deferred_op</span> <span class="o">=</span> <span class="n">DummyOperator</span><span class="p">(</span><span class="n">task_id</span><span class="o">=</span><span class="s1">&#39;op2&#39;</span><span class="p">)</span>
+<span class="n">deferred_op</span><span class="o">.</span><span class="n">dag</span> <span class="o">=</span> <span class="n">dag</span>
+
+<span class="c1"># inferred DAG assignment (linked operators must be in the same DAG)</span>
+<span class="n">inferred_op</span> <span class="o">=</span> <span class="n">DummyOperator</span><span class="p">(</span><span class="n">task_id</span><span class="o">=</span><span class="s1">&#39;op3&#39;</span><span class="p">)</span>
+<span class="n">inferred_op</span><span class="o">.</span><span class="n">set_upstream</span><span class="p">(</span><span class="n">deferred_op</span><span class="p">)</span>
+</pre></div>
+</div>
+</div>
+<div class="section" id="bitshift-composition">
+<h4>Bitshift Composition<a class="headerlink" href="#bitshift-composition" title="Permalink to this headline">¶</a></h4>
+<p><em>Added in Airflow 1.8</em></p>
+<p>Traditionally, operator relationships are set with the <code class="docutils literal"><span class="pre">set_upstream()</span></code> and
+<code class="docutils literal"><span class="pre">set_downstream()</span></code> methods. In Airflow 1.8, this can be done with the Python
+bitshift operators <code class="docutils literal"><span class="pre">&gt;&gt;</span></code> and <code class="docutils literal"><span class="pre">&lt;&lt;</span></code>. The following four statements are all
+functionally equivalent:</p>
+<div class="code python highlight-default"><div class="highlight"><pre><span></span><span class="n">op1</span> <span class="o">&gt;&gt;</span> <span class="n">op2</span>
+<span class="n">op1</span><span class="o">.</span><span class="n">set_downstream</span><span class="p">(</span><span class="n">op2</span><span class="p">)</span>
+
+<span class="n">op2</span> <span class="o">&lt;&lt;</span> <span class="n">op1</span>
+<span class="n">op2</span><span class="o">.</span><span class="n">set_upstream</span><span class="p">(</span><span class="n">op1</span><span class="p">)</span>
+</pre></div>
+</div>
+<p>When using the bitshift to compose operators, the relationship is set in the
+direction that the bitshift operator points. For example, <code class="docutils literal"><span class="pre">op1</span> <span class="pre">&gt;&gt;</span> <span class="pre">op2</span></code> means
+that <code class="docutils literal"><span class="pre">op1</span></code> runs first and <code class="docutils literal"><span class="pre">op2</span></code> runs second. Multiple operators can be
+composed &#8211; keep in mind the chain is executed left-to-right and the rightmost
+object is always returned. For example:</p>
+<div class="code python highlight-default"><div class="highlight"><pre><span></span><span class="n">op1</span> <span class="o">&gt;&gt;</span> <span class="n">op2</span> <span class="o">&gt;&gt;</span> <span class="n">op3</span> <span class="o">&lt;&lt;</span> <span class="n">op4</span>
+</pre></div>
+</div>
+<p>is equivalent to:</p>
+<div class="code python highlight-default"><div class="highlight"><pre><span></span><span class="n">op1</span><span class="o">.</span><span class="n">set_downstream</span><span class="p">(</span><span class="n">op2</span><span class="p">)</span>
+<span class="n">op2</span><span class="o">.</span><span class="n">set_downstream</span><span class="p">(</span><span class="n">op3</span><span class="p">)</span>
+<span class="n">op3</span><span class="o">.</span><span class="n">set_upstream</span><span class="p">(</span><span class="n">op4</span><span class="p">)</span>
+</pre></div>
+</div>
+<p>For convenience, the bitshift operators can also be used with DAGs. For example:</p>
+<div class="code python highlight-default"><div class="highlight"><pre><span></span><span class="n">dag</span> <span class="o">&gt;&gt;</span> <span class="n">op1</span> <span class="o">&gt;&gt;</span> <span class="n">op2</span>
+</pre></div>
+</div>
+<p>is equivalent to:</p>
+<div class="code python highlight-default"><div class="highlight"><pre><span></span><span class="n">op1</span><span class="o">.</span><span class="n">dag</span> <span class="o">=</span> <span class="n">dag</span>
+<span class="n">op1</span><span class="o">.</span><span class="n">set_downstream</span><span class="p">(</span><span class="n">op2</span><span class="p">)</span>
+</pre></div>
+</div>
+<p>We can put this all together to build a simple pipeline:</p>
+<div class="code python highlight-default"><div class="highlight"><pre><span></span><span class="k">with</span> <span class="n">DAG</span><span class="p">(</span><span class="s1">&#39;my_dag&#39;</span><span class="p">,</span> <span class="n">start_date</span><span class="o">=</span><span class="n">datetime</span><span class="p">(</span><span class="mi">2016</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span> <span class="k">as</span> <span class="n">dag</span><span class="p">:</span>
+    <span class="p">(</span>
+        <span class="n">dag</span>
+        <span class="o">&gt;&gt;</span> <span class="n">DummyOperator</span><span class="p">(</span><span class="n">task_id</span><span class="o">=</span><span class="s1">&#39;dummy_1&#39;</span><span class="p">)</span>
+        <span class="o">&gt;&gt;</span> <span class="n">BashOperator</span><span class="p">(</span>
+            <span class="n">task_id</span><span class="o">=</span><span class="s1">&#39;bash_1&#39;</span><span class="p">,</span>
+            <span class="n">bash_command</span><span class="o">=</span><span class="s1">&#39;echo &quot;HELLO!&quot;&#39;</span><span class="p">)</span>
+        <span class="o">&gt;&gt;</span> <span class="n">PythonOperator</span><span class="p">(</span>
+            <span class="n">task_id</span><span class="o">=</span><span class="s1">&#39;python_1&#39;</span><span class="p">,</span>
+            <span class="n">python_callable</span><span class="o">=</span><span class="k">lambda</span><span class="p">:</span> <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;GOODBYE!&quot;</span><span class="p">))</span>
+    <span class="p">)</span>
+</pre></div>
+</div>
+</div>
+</div>
+<div class="section" id="tasks">
+<h3>Tasks<a class="headerlink" href="#tasks" title="Permalink to this headline">¶</a></h3>
+<p>Once an operator is instantiated, it is referred to as a &#8220;task&#8221;. The
+instantiation defines specific values when calling the abstract operator, and
+the parameterized task becomes a node in a DAG.</p>
+</div>
+<div class="section" id="task-instances">
+<h3>Task Instances<a class="headerlink" href="#task-instances" title="Permalink to this headline">¶</a></h3>
+<p>A task instance represents a specific run of a task and is characterized as the
+combination of a dag, a task, and a point in time. Task instances also have an
+indicative state, which could be &#8220;running&#8221;, &#8220;success&#8221;, &#8220;failed&#8221;, &#8220;skipped&#8221;, &#8220;up
+for retry&#8221;, etc.</p>
+</div>
+<div class="section" id="workflows">
+<h3>Workflows<a class="headerlink" href="#workflows" title="Permalink to this headline">¶</a></h3>
+<p>You&#8217;re now familiar with the core building blocks of Airflow.
+Some of the concepts may sound very similar, but the vocabulary can
+be conceptualized like this:</p>
+<ul class="simple">
+<li>DAG: a description of the order in which work should take place</li>
+<li>Operator: a class that acts as a template for carrying out some work</li>
+<li>Task: a parameterized instance of an operator</li>
+<li>Task Instance: a task that 1) has been assigned to a DAG and 2) has a
+state associated with a specific run of the DAG</li>
+</ul>
+<p>By combining <code class="docutils literal"><span class="pre">DAGs</span></code> and <code class="docutils literal"><span class="pre">Operators</span></code> to create <code class="docutils literal"><span class="pre">TaskInstances</span></code>, you can
+build complex workflows.</p>
+</div>
+</div>
+<div class="section" id="additional-functionality">
+<h2>Additional Functionality<a class="headerlink" href="#additional-functionality" title="Permalink to this headline">¶</a></h2>
+<p>In addition to the core Airflow objects, there are a number of more complex
+features that enable behaviors like limiting simultaneous access to resources,
+cross-communication, conditional execution, and more.</p>
+<div class="section" id="hooks">
+<h3>Hooks<a class="headerlink" href="#hooks" title="Permalink to this headline">¶</a></h3>
+<p>Hooks are interfaces to external platforms and databases like Hive, S3,
+MySQL, Postgres, HDFS, and Pig. Hooks implement a common interface when
+possible, and act as a building block for operators. They also use
+the <code class="docutils literal"><span class="pre">airflow.models.Connection</span></code> model to retrieve hostnames
+and authentication information. Hooks keep authentication code and
+information out of pipelines, centralized in the metadata database.</p>
+<p>Hooks are also very useful to use on their own in Python scripts,
+in the <code class="docutils literal"><span class="pre">airflow.operators.PythonOperator</span></code>, and in interactive environments
+like IPython or Jupyter Notebook.</p>
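+<p>For instance, a hook might be used directly like this (a sketch; the
+<code class="docutils literal"><span class="pre">conn_id</span></code> and import path are assumptions that depend on your setup and
+Airflow version):</p>
+<div class="code python highlight-default"><div class="highlight"><pre>from airflow.hooks import MySqlHook
+
+# fetch rows using the connection registered as 'mysql_default'
+hook = MySqlHook(mysql_conn_id='mysql_default')
+records = hook.get_records('SELECT count(*) FROM users')
+</pre></div>
+</div>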
+</div>
+<div class="section" id="pools">
+<h3>Pools<a class="headerlink" href="#pools" title="Permalink to this headline">¶</a></h3>
+<p>Some systems can get overwhelmed when too many processes hit them at the same
+time. Airflow pools can be used to <strong>limit the execution parallelism</strong> on
+arbitrary sets of tasks. The list of pools is managed in the UI
+(<code class="docutils literal"><span class="pre">Menu</span> <span class="pre">-&gt;</span> <span class="pre">Admin</span> <span class="pre">-&gt;</span> <span class="pre">Pools</span></code>) by giving the pools a name and assigning
+it a number of worker slots. Tasks can then be associated with
+one of the existing pools by using the <code class="docutils literal"><span class="pre">pool</span></code> parameter when
+creating tasks (i.e., instantiating operators).</p>
+<div class="code python highlight-default"><div class="highlight"><pre><span></span><span class="n">aggregate_db_message_job</span> <span class="o">=</span> <span class="n">BashOperator</span><span class="p">(</span>
+    <span class="n">task_id</span><span class="o">=</span><span class="s1">&#39;aggregate_db_message_job&#39;</span><span class="p">,</span>
+    <span class="n">execution_timeout</span><span class="o">=</span><span class="n">timedelta</span><span class="p">(</span><span class="n">hours</span><span class="o">=</span><span class="mi">3</span><span class="p">),</span>
+    <span class="n">pool</span><span class="o">=</span><span class="s1">&#39;ep_data_pipeline_db_msg_agg&#39;</span><span class="p">,</span>
+    <span class="n">bash_command</span><span class="o">=</span><span class="n">aggregate_db_message_job_cmd</span><span class="p">,</span>
+    <span class="n">dag</span><span class="o">=</span><span class="n">dag</span><span class="p">)</span>
+<span class="n">aggregate_db_message_job</span><span class="o">.</span><span class="n">set_upstream</span><span class="p">(</span><span class="n">wait_for_empty_queue</span><span class="p">)</span>
+</pre></div>
+</div>
+<p>The <code class="docutils literal"><span class="pre">pool</span></code> parameter can
+be used in conjunction with <code class="docutils literal"><span class="pre">priority_weight</span></code> to define priorities
+in the queue, and which tasks get executed first as slots open up in the
+pool. The default <code class="docutils literal"><span class="pre">priority_weight</span></code> is <code class="docutils literal"><span class="pre">1</span></code>, and can be bumped to any
+number. When sorting the queue to evaluate which task should be executed
+next, we use the <code class="docutils literal"><span class="pre">priority_weight</span></code>, summed up with all of the
+<code class="docutils literal"><span class="pre">priority_weight</span></code> values from tasks downstream from this task. You can
+use this to bump a specific important task, and the whole path leading to
+that task will be prioritized accordingly.</p>
+<p>Tasks will be scheduled as usual while the slots fill up. Once capacity is
+reached, runnable tasks get queued and their state will show as such in the
+UI. As slots free up, queued tasks start running based on the
+<code class="docutils literal"><span class="pre">priority_weight</span></code> (of the task and its descendants).</p>
+<p>Note that by default tasks aren&#8217;t assigned to any pool and their
+execution parallelism is only limited to the executor&#8217;s setting.</p>
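+<p>As an illustration, two tasks sharing a pool, with one path prioritized
+(a sketch; the pool name, commands, and weights are arbitrary):</p>
+<div class="code python highlight-default"><div class="highlight"><pre># both tasks compete for slots in the same pool; when slots open up,
+# the higher priority_weight is scheduled first
+important = BashOperator(
+    task_id='important_query',
+    bash_command='./run_query.sh important',
+    pool='db_pool',
+    priority_weight=10,
+    dag=dag)
+
+routine = BashOperator(
+    task_id='routine_query',
+    bash_command='./run_query.sh routine',
+    pool='db_pool',
+    priority_weight=1,  # the default
+    dag=dag)
+</pre></div>
+</div>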
+</div>
+<div class="section" id="connections">
+<h3>Connections<a class="headerlink" href="#connections" title="Permalink to this headline">¶</a></h3>
+<p>The connection information to external systems is stored in the Airflow
+metadata database and managed in the UI (<code class="docutils literal"><span class="pre">Menu</span> <span class="pre">-&gt;</span> <span class="pre">Admin</span> <span class="pre">-&gt;</span> <span class="pre">Connections</span></code>).
+A <code class="docutils literal"><span class="pre">conn_id</span></code> is defined there, with hostname / login / password / schema
+information attached to it. Airflow pipelines can simply refer to the
+centrally managed <code class="docutils literal"><span class="pre">conn_id</span></code> without having to hard code any of this
+information anywhere.</p>
+<p>Many connections with the same <code class="docutils literal"><span class="pre">conn_id</span></code> can be defined. When that
+is the case, and when a <strong>hook</strong> uses the <code class="docutils literal"><span class="pre">get_connection</span></code> method
+from <code class="docutils literal"><span class="pre">BaseHook</span></code>, Airflow will choose one connection randomly, allowing
+for some basic load balancing and fault tolerance when used in conjunction
+with retries.</p>
+<p>Airflow also has the ability to reference connections via environment
+variables from the operating system. The environment variable needs to be
+prefixed with <code class="docutils literal"><span class="pre">AIRFLOW_CONN_</span></code> to be considered a connection. When
+referencing the connection in the Airflow pipeline, the <code class="docutils literal"><span class="pre">conn_id</span></code> should
+be the name of the variable without the prefix. For example, if the <code class="docutils literal"><span class="pre">conn_id</span></code>
+is named <code class="docutils literal"><span class="pre">POSTGRES_MASTER</span></code> the environment variable should be named
+<code class="docutils literal"><span class="pre">AIRFLOW_CONN_POSTGRES_MASTER</span></code>. Airflow assumes the value returned
+from the environment variable to be in a URI format
+(e.g. <code class="docutils literal"><span class="pre">postgres://user:password&#64;localhost:5432/master</span></code>).</p>
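+<p>A minimal sketch of this mechanism (the URI below is a placeholder, not a
+real credential, and the import path may vary by version):</p>
+<div class="code python highlight-default"><div class="highlight"><pre>import os
+
+from airflow.hooks.base_hook import BaseHook
+
+# expose a connection through the environment
+os.environ['AIRFLOW_CONN_POSTGRES_MASTER'] = (
+    'postgres://user:password@localhost:5432/master')
+
+conn = BaseHook.get_connection('POSTGRES_MASTER')
+print(conn.host)  # localhost
+</pre></div>
+</div>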
+</div>
+<div class="section" id="queues">
+<h3>Queues<a class="headerlink" href="#queues" title="Permalink to this headline">¶</a></h3>
+<p>When using the CeleryExecutor, the celery queues that tasks are sent to
+can be specified. <code class="docutils literal"><span class="pre">queue</span></code> is an attribute of BaseOperator, so any
+task can be assigned to any queue. The default queue for the environment
+is defined in the <code class="docutils literal"><span class="pre">airflow.cfg</span></code>&#8217;s <code class="docutils literal"><span class="pre">celery</span> <span class="pre">-&gt;</span> <span class="pre">default_queue</span></code>. This defines
+the queue that tasks get assigned to when not specified, as well as which
+queue Airflow workers listen to when started.</p>
+<p>Workers can listen to one or multiple queues of tasks. When a worker is
+started (using the command <code class="docutils literal"><span class="pre">airflow</span> <span class="pre">worker</span></code>), a set of comma delimited
+queue names can be specified (e.g. <code class="docutils literal"><span class="pre">airflow</span> <span class="pre">worker</span> <span class="pre">-q</span> <span class="pre">spark</span></code>). This worker
+will then only pick up tasks wired to the specified queue(s).</p>
+<p>This can be useful if you need specialized workers, either from a
+resource perspective (for say very lightweight tasks where one worker
+could take thousands of tasks without a problem), or from an environment
+perspective (you want a worker running from within the Spark cluster
+itself because it needs a very specific environment and security rights).</p>
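+<p>Assigning a task to a queue is just a matter of setting the attribute
+(a sketch; the queue name and command are illustrative):</p>
+<div class="code python highlight-default"><div class="highlight"><pre># a worker started with `airflow worker -q spark` will be the only
+# one to pick up this task
+spark_task = BashOperator(
+    task_id='spark_submit',
+    bash_command='spark-submit /path/to/job.py',
+    queue='spark',
+    dag=dag)
+</pre></div>
+</div>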
+</div>
+<div class="section" id="xcoms">
+<h3>XComs<a class="headerlink" href="#xcoms" title="Permalink to this headline">¶</a></h3>
+<p>XComs let tasks exchange messages, allowing more nuanced forms of control and
+shared state. The name is an abbreviation of &#8220;cross-communication&#8221;. XComs are
+principally defined by a key, value, and timestamp, but also track attributes
+like the task/DAG that created the XCom and when it should become visible. Any
+object that can be pickled can be used as an XCom value, so users should make
+sure to use objects of appropriate size.</p>
+<p>XComs can be &#8220;pushed&#8221; (sent) or &#8220;pulled&#8221; (received). When a task pushes an
+XCom, it makes it generally available to other tasks. Tasks can push XComs at
+any time by calling the <code class="docutils literal"><span class="pre">xcom_push()</span></code> method. In addition, if a task returns
+a value (either from its Operator&#8217;s <code class="docutils literal"><span class="pre">execute()</span></code> method, or from a
+PythonOperator&#8217;s <code class="docutils literal"><span class="pre">python_callable</span></code> function), then an XCom containing that
+value is automatically pushed.</p>
+<p>Tasks call <code class="docutils literal"><span class="pre">xcom_pull()</span></code> to retrieve XComs, optionally applying filters
+based on criteria like <code class="docutils literal"><span class="pre">key</span></code>, source <code class="docutils literal"><span class="pre">task_ids</span></code>, and source <code class="docutils literal"><span class="pre">dag_id</span></code>. By
+default, <code class="docutils literal"><span class="pre">xcom_pull()</span></code> filters for the keys that are automatically given to
+XComs when they are pushed by being returned from execute functions (as
+opposed to XComs that are pushed manually).</p>
+<p>If <code class="docutils literal"><span class="pre">xcom_pull</span></code> is passed a single string for <code class="docutils literal"><span class="pre">task_ids</span></code>, then the most
+recent XCom value from that task is returned; if a list of <code class="docutils literal"><span class="pre">task_ids</span></code> is
+passed, then a corresponding list of XCom values is returned.</p>
+<div class="code python highlight-default"><div class="highlight"><pre><span></span><span class="c1"># inside a PythonOperator called &#39;pushing_task&#39;</span>
+<span class="k">def</span> <span class="nf">push_function</span><span class="p">():</span>
+    <span class="k">return</span> <span class="n">value</span>
+
+<span class="c1"># inside another PythonOperator where provide_context=True</span>
+<span class="k">def</span> <span class="nf">pull_function</span><span class="p">(</span><span class="o">**</span><span class="n">context</span><span class="p">):</span>
+    <span class="n">value</span> <span class="o">=</span> <span class="n">context</span><span class="p">[</span><span class="s1">&#39;task_instance&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">xcom_pull</span><span class="p">(</span><span class="n">task_ids</span><span class="o">=</span><span class="s1">&#39;pushing_task&#39;</span><span class="p">)</span>
+</pre></div>
+</div>
+<p>It is also possible to pull XComs directly in a template; here&#8217;s an example
+of what this may look like:</p>
+<div class="code sql highlight-default"><div class="highlight"><pre><span></span><span class="n">SELECT</span> <span class="o">*</span> <span class="n">FROM</span> <span class="p">{{</span> <span class="n">task_instance</span><span class="o">.</span><span class="n">xcom_pull</span><span class="p">(</span><span class="n">task_ids</span><span class="o">=</span><span class="s1">&#39;foo&#39;</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="s1">&#39;table_name&#39;</span><span class="p">)</span> <span class="p">}}</span>
+</pre></div>
+</div>
+<p>Note that XComs are similar to <a class="reference internal" href="#variables">Variables</a>, but are specifically designed
+for inter-task communication rather than global settings.</p>
+</div>
+<div class="section" id="variables">
+<h3>Variables<a class="headerlink" href="#variables" title="Permalink to this headline">¶</a></h3>
+<p>Variables are a generic way to store and retrieve arbitrary content or
+settings as a simple key value store within Airflow. Variables can be
+listed, created, updated and deleted from the UI (<code class="docutils literal"><span class="pre">Admin</span> <span class="pre">-&gt;</span> <span class="pre">Variables</span></code>),
+code or CLI. While your pipeline code definition and most of your constants
+and variables should be defined in code and stored in source control,
+it can be useful to have some variables or configuration items
+accessible and modifiable through the UI.</p>
+<div class="code python highlight-default"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">airflow.models</span> <span class="k">import</span> <span class="n">Variable</span>
+<span class="n">foo</span> <span class="o">=</span> <span class="n">Variable</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;foo&quot;</span><span class="p">)</span>
+<span class="n">bar</span> <span class="o">=</span> <span class="n">Variable</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;bar&quot;</span><span class="p">,</span> <span class="n">deserialize_json</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+</pre></div>
+</div>
+<p>The second call assumes <code class="docutils literal"><span class="pre">json</span></code> content and will be deserialized into
+<code class="docutils literal"><span class="pre">bar</span></code>. Note that <code class="docutils literal"><span class="pre">Variable</span></code> is a sqlalchemy model and can be used
+as such.</p>
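+<p>Writing a variable, or supplying a default when a key may be missing,
+follows the same pattern (a sketch; the <code class="docutils literal"><span class="pre">default_var</span></code> keyword is an
+assumption for your Airflow version):</p>
+<div class="code python highlight-default"><div class="highlight"><pre>Variable.set("foo", "baz")
+missing = Variable.get("might_not_exist", default_var=None)
+</pre></div>
+</div>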
+</div>
+<div class="section" id="branching">
+<h3>Branching<a class="headerlink" href="#branching" title="Permalink to this headline">¶</a></h3>
+<p>Sometimes you need a workflow to branch, or only go down a certain path
+based on an arbitrary condition which is typically related to something
+that happened in an upstream task. One way to do this is by using the
+<code class="docutils literal"><span class="pre">BranchPythonOperator</span></code>.</p>
+<p>The <code class="docutils literal"><span class="pre">BranchPythonOperator</span></code> is much like the PythonOperator except that it
+expects a python_callable that returns a task_id. The task_id returned
+is followed, and all of the other paths are skipped.
+The task_id returned by the Python function must reference a task
+directly downstream from the BranchPythonOperator task.</p>
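+<p>A minimal sketch of a branching DAG (task ids and the condition are
+illustrative, and a <code class="docutils literal"><span class="pre">dag</span></code> object is assumed to exist):</p>
+<div class="code python highlight-default"><div class="highlight"><pre>from airflow.operators import BranchPythonOperator, DummyOperator
+
+def choose_branch():
+    # any condition can go here; returning a task_id selects that path
+    return 'branch_true'
+
+branching = BranchPythonOperator(
+    task_id='branching',
+    python_callable=choose_branch,
+    dag=dag)
+
+branch_true = DummyOperator(task_id='branch_true', dag=dag)
+branch_false = DummyOperator(task_id='branch_false', dag=dag)
+
+branching &gt;&gt; branch_true
+branching &gt;&gt; branch_false  # this path will be skipped
+</pre></div>
+</div>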
+<p>Note that using tasks with <code class="docutils literal"><span class="pre">depends_on_past=True</span></code> downstream from
+<code class="docutils literal"><span class="pre">BranchPythonOperator</span></code> is logically unsound as <code class="docutils literal"><span class="pre">skipped</span></code> status
+will invariably lead to blocked tasks that depend on their past successes.
+The <code class="docutils literal"><span class="pre">skipped</span></code> state propagates to a task when all of its directly upstream
+tasks are <code class="docutils literal"><span class="pre">skipped</span></code>.</p>
+<p>If you want to skip some tasks, keep in mind that you can&#8217;t have an empty
+path; if you need one, add a dummy task.</p>
+<p>Like this, where the dummy task &#8220;branch_false&#8221; is skipped:</p>
+<img alt="_images/branch_good.png" src="_images/branch_good.png" />
+<p>Not like this, where the join task is skipped:</p>
+<img alt="_images/branch_bad.png" src="_images/branch_bad.png" />
+</div>
+<div class="section" id="subdags">
+<h3>SubDAGs<a class="headerlink" href="#subdags" title="Permalink to this headline">¶</a></h3>
+<p>SubDAGs are perfect for repeating patterns. Defining a function that returns a
+DAG object is a nice design pattern when using Airflow.</p>
+<p>Airbnb uses the <em>stage-check-exchange</em> pattern when loading data. Data is staged
+in a temporary table, after which data quality checks are performed against
+that table. Once the checks all pass the partition is moved into the production
+table.</p>
+<p>As another example, consider the following DAG:</p>
+<img alt="_images/subdag_before.png" src="_images/subdag_before.png" />
+<p>We can combine all of the parallel <code class="docutils literal"><span class="pre">task-*</span></code> operators into a single SubDAG,
+so that the resulting DAG resembles the following:</p>
+<img alt="_images/subdag_after.png" src="_images/subdag_after.png" />
+<p>Note that SubDAG operators should contain a factory method that returns a DAG
+object. This will prevent the SubDAG from being treated like a separate DAG in
+the main UI. For example:</p>
+<div class="code python highlight-default"><div class="highlight"><pre><span></span><span class="c1">#dags/subdag.py</span>
+<span class="kn">from</span> <span class="nn">airflow.models</span> <span class="k">import</span> <span class="n">DAG</span>
+<span class="kn">from</span> <span class="nn">airflow.operators</span> <span class="k">import</span> <span class="n">DummyOperator</span>
+
+
+<span class="c1"># Dag is returned by a factory method</span>
+<span class="k">def</span> <span class="nf">sub_dag</span><span class="p">(</span><span class="n">parent_dag_name</span><span class="p">,</span> <span class="n">child_dag_name</span><span class="p">,</span> <span class="n">start_date</span><span class="p">,</span> <span class="n">schedule_interval</span><span class="p">):</span>
+  <span class="n">dag</span> <span class="o">=</span> <span class="n">DAG</span><span class="p">(</span>
+    <span class="s1">&#39;</span><span class="si">%s</span><span class="s1">.</span><span class="si">%s</span><span class="s1">&#39;</span> <span class="o">%</span> <span class="p">(</span><span class="n">parent_dag_name</span><span class="p">,</span> <span class="n">child_dag_name</span><span class="p">),</span>
+    <span class="n">schedule_interval</span><span class="o">=</span><span class="n">schedule_interval</span><span class="p">,</span>
+    <span class="n">start_date</span><span class="o">=</span><span class="n">start_date</span><span class="p">,</span>
+  <span class="p">)</span>
+
+  <span class="n">dummy_operator</span> <span class="o">=</span> <span class="n">DummyOperator</span><span class="p">(</span>
+    <span class="n">task_id</span><span class="o">=</span><span class="s1">&#39;dummy_task&#39;</span><span class="p">,</span>
+    <span class="n">dag</span><span class="o">=</span><span class="n">dag</span><span class="p">,</span>
+  <span class="p">)</span>
+
+  <span class="k">return</span> <span class="n">dag</span>
+</pre></div>
+</div>
+<p>This SubDAG can then be referenced in your main DAG file:</p>
+<div class="code python highlight-default"><div class="highlight"><pre><span></span><span class="c1"># main_dag.py</span>
+<span class="kn">from</span> <span class="nn">datetime</span> <span class="k">import</span> <span class="n">datetime</span><span class="p">,</span> <span class="n">timedelta</span>
+<span class="kn">from</span> <span class="nn">airflow.models</span> <span class="k">import</span> <span class="n">DAG</span>
+<span class="kn">from</span> <span class="nn">airflow.operators</span> <span class="k">import</span> <span class="n">SubDagOperator</span>
+<span class="kn">from</span> <span class="nn">dags.subdag</span> <span class="k">import</span> <span class="n">sub_dag</span>
+
+
+<span class="n">PARENT_DAG_NAME</span> <span class="o">=</span> <span class="s1">&#39;parent_dag&#39;</span>
+<span class="n">CHILD_DAG_NAME</span> <span class="o">=</span> <span class="s1">&#39;child_dag&#39;</span>
+
+<span class="n">main_dag</span> <span class="o">=</span> <span class="n">DAG</span><span class="p">(</span>
+  <span class="n">dag_id</span><span class="o">=</span><span class="n">PARENT_DAG_NAME</span><span class="p">,</span>
+  <span class="n">schedule_interval</span><span class="o">=</span><span class="n">timedelta</span><span class="p">(</span><span class="n">hours</span><span class="o">=</span><span class="mi">1</span><span class="p">),</span>
+  <span class="n">start_date</span><span class="o">=</span><span class="n">datetime</span><span class="p">(</span><span class="mi">2016</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
+<span class="p">)</span>
+
+<span class="n">sub_dag</span> <span class="o">=</span> <span class="n">SubDagOperator</span><span class="p">(</span>
+  <span class="n">subdag</span><span class="o">=</span><span class="n">sub_dag</span><span class="p">(</span><span class="n">PARENT_DAG_NAME</span><span class="p">,</span> <span class="n">CHILD_DAG_NAME</span><span class="p">,</span> <span class="n">main_dag</span><span class="o">.</span><span class="n">start_date</span><span class="p">,</span>
+                 <span class="n">main_dag</span><span class="o">.</span><span class="n">schedule_interval</span><span class="p">),</span>
+  <span class="n">task_id</span><span class="o">=</span><span class="n">CHILD_DAG_NAME</span><span class="p">,</span>
+  <span class="n">dag</span><span class="o">=</span><span class="n">main_dag</span><span class="p">,</span>
+<span class="p">)</span>
+</pre></div>
+</div>
+<p>You can zoom into a SubDagOperator from the graph view of the main DAG to show
+the tasks contained within the SubDAG:</p>
+<img alt="_images/subdag_zoom.png" src="_images/subdag_zoom.png" />
+<p>Some other tips when using SubDAGs:</p>
+<ul class="simple">
+<li>by convention, a SubDAG&#8217;s <code class="docutils literal"><span class="pre">dag_id</span></code> should be prefixed by its parent and
+a dot, as in <code class="docutils literal"><span class="pre">parent.child</span></code></li>
+<li>share arguments between the main DAG and the SubDAG by passing arguments to
+the SubDAG operator (as demonstrated above)</li>
+<li>SubDAGs must have a schedule and be enabled. If the SubDAG&#8217;s schedule is
+set to <code class="docutils literal"><span class="pre">None</span></code> or <code class="docutils literal"><span class="pre">&#64;once</span></code>, the SubDAG will succeed without having done
+anything</li>
+<li>clearing a SubDagOperator also clears the state of the tasks within</li>
+<li>marking success on a SubDagOperator does not affect the state of the tasks
+within</li>
+<li>refrain from using <code class="docutils literal"><span class="pre">depends_on_past=True</span></code> in tasks within the SubDAG as
+this can be confusing</li>
+<li>it is possible to specify an executor for the SubDAG. It is common to use
+the SequentialExecutor if you want to run the SubDAG in-process and
+effectively limit its parallelism to one. Using LocalExecutor can be
+problematic as it may over-subscribe your worker, running multiple tasks in
+a single slot</li>
+</ul>
+<p>See <code class="docutils literal"><span class="pre">airflow/example_dags</span></code> for a demonstration.</p>
+</div>
+<div class="section" id="slas">
+<h3>SLAs<a class="headerlink" href="#slas" title="Permalink to this headline">¶</a></h3>
+<p>Service Level Agreements, or the time by which a task or DAG should have
+succeeded, can be set at a task level as a <code class="docutils literal"><span class="pre">timedelta</span></code>. If
+one or many instances have not succeeded by that time, an alert email is sent
+detailing the list of tasks that missed their SLA. The event is also recorded
+in the database and made available in the web UI under <code class="docutils literal"><span class="pre">Browse-&gt;Missed</span> <span class="pre">SLAs</span></code>
+where events can be analyzed and documented.</p>
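+<p>For example, attaching an SLA to a task might look like this sketch (the
+two hour window and the command are arbitrary):</p>
+<div class="code python highlight-default"><div class="highlight"><pre>from datetime import timedelta
+
+# email an alert if this task hasn't succeeded within two hours
+# of the scheduled period
+t = BashOperator(
+    task_id='sla_task',
+    bash_command='./long_running_job.sh',
+    sla=timedelta(hours=2),
+    dag=dag)
+</pre></div>
+</div>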
+</div>
+<div class="section" id="trigger-rules">
+<h3>Trigger Rules<a class="headerlink" href="#trigger-rules" title="Permalink to this headline">¶</a></h3>
+<p>Though the normal workflow behavior is to trigger tasks when all their
+directly upstream tasks have succeeded, Airflow allows for more complex
+dependency settings.</p>
+<p>All operators have a <code class="docutils literal"><span class="pre">trigger_rule</span></code> argument which defines the rule by which
+the generated task gets triggered. The default value for <code class="docutils literal"><span class="pre">trigger_rule</span></code> is
+<code class="docutils literal"><span class="pre">all_success</span></code> and can be defined as &#8220;trigger this task when all directly
+upstream tasks have succeeded&#8221;. All other rules described here are based
+on direct parent tasks and are values that can be passed to any operator
+while creating tasks:</p>
+<ul class="simple">
+<li><code class="docutils literal"><span class="pre">all_success</span></code>: (default) all parents have succeeded</li>
+<li><code class="docutils literal"><span class="pre">all_failed</span></code>: all parents are in a <code class="docutils literal"><span class="pre">failed</span></code> or <code class="docutils literal"><span class="pre">upstream_failed</span></code> state</li>
+<li><code class="docutils literal"><span class="pre">all_done</span></code>: all parents are done with their execution</li>
+<li><code class="docutils literal"><span class="pre">one_failed</span></code>: fires as soon as at least one parent has failed, it does not wait for all parents to be done</li>
+<li><code class="docutils literal"><span class="pre">one_success</span></code>: fires as soon as at least one parent succeeds, it does not wait for all parents to be done</li>
+<li><code class="docutils literal"><span class="pre">dummy</span></code>: dependencies are just for show, trigger at will</li>
+</ul>
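+<p>For example, a cleanup task that should run once all of its parents are
+done, whether they succeeded or failed, might be sketched as:</p>
+<div class="code python highlight-default"><div class="highlight"><pre>cleanup = BashOperator(
+    task_id='cleanup',
+    bash_command='rm -rf /tmp/staging',  # illustrative path
+    trigger_rule='all_done',
+    dag=dag)
+</pre></div>
+</div>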
+<p>Note that these can be used in conjunction with <code class="docutils literal"><span class="pre">depends_on_past</span></code> (boolean)
+that, when set to <code class="docutils literal"><span class="pre">True</span></code>, keeps a task from getting triggered if the
+previous schedule for the task hasn&#8217;t succeeded.</p>
+</div>
+<div class="section" id="zombies-undeads">
+<h3>Zombies &amp; Undeads<a class="headerlink" href="#zombies-undeads" title="Permalink to this headline">¶</a></h3>
+<p>Task instances die all the time, usually as part of their normal life cycle,
+but sometimes unexpectedly.</p>
+<p>Zombie tasks are characterized by the absence
+of a heartbeat (emitted by the job periodically) and a <code class="docutils literal"><span class="pre">running</span></code> status
+in the database. They can occur when a worker node can&#8217;t reach the database,
+when Airflow processes are killed externally, or when a node gets rebooted
+for instance. Zombie killing is performed periodically by the scheduler&#8217;s
+process.</p>
+<p>Undead processes are characterized by the existence of a process and a matching
+heartbeat, but Airflow isn&#8217;t aware of this task as <code class="docutils literal"><span class="pre">running</span></code> in the database.
+This mismatch typically occurs when the state of the database is altered,
+most likely by deleting rows in the &#8220;Task Instances&#8221; view in the UI.
+Tasks are instructed to verify their state as part of the heartbeat routine,
+and terminate themselves upon figuring out that they are in this &#8220;undead&#8221;
+state.</p>
+</div>
+<div class="section" id="cluster-policy">
+<h3>Cluster Policy<a class="headerlink" href="#cluster-policy" title="Permalink to this headline">¶</a></h3>
+<p>Your local airflow settings file can define a <code class="docutils literal"><span class="pre">policy</span></code> function that
+has the ability to mutate task attributes based on other task or DAG
+attributes. It receives a single task object as its argument,
+and is expected to alter that task&#8217;s attributes in place.</p>
+<p>For example, this function could apply a specific queue property when
+using a specific operator, or enforce a task timeout policy, making sure
+that no tasks run for more than 48 hours. Here&#8217;s an example of what this
+may look like inside your <code class="docutils literal"><span class="pre">airflow_settings.py</span></code>:</p>
+<div class="code python highlight-default"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">policy</span><span class="p">(</span><span class="n">task</span><span class="p">):</span>
+    <span class="k">if</span> <span class="n">task</span><span class="o">.</span><span class="n">__class__</span><span class="o">.</span><span class="n">__name__</span> <span class="o">==</span> <span class="s1">&#39;HivePartitionSensor&#39;</span><span class="p">:</span>
+        <span class="n">task</span><span class="o">.</span><span class="n">queue</span> <span class="o">=</span> <span class="s2">&quot;sensor_queue&quot;</span>
+    <span class="k">if</span> <span class="n">task</span><span class="o">.</span><span class="n">timeout</span> <span class="o">&gt;</span> <span class="n">timedelta</span><span class="p">(</span><span class="n">hours</span><span class="o">=</span><span class="mi">48</span><span class="p">):</span>
+        <span class="n">task</span><span class="o">.</span><span class="n">timeout</span> <span class="o">=</span> <span class="n">timedelta</span><span class="p">(</span><span class="n">hours</span><span class="o">=</span><span class="mi">48</span><span class="p">)</span>
+</pre></div>
+</div>
+</div>
+<div class="section" id="documentation-notes">
+<h3>Documentation &amp; Notes<a class="headerlink" href="#documentation-notes" title="Permalink to this headline">&para;</a></h3>
+<p>It&#8217;s possible to add documentation or notes to your dags &amp; task objects that
+become visible in the web interface (&#8220;Graph View&#8221; for dags, &#8220;Task Details&#8221; for
+tasks). There is a set of special task attributes that get rendered as rich
+content if defined:</p>
+<table border="1" class="docutils">
+<colgroup>
+<col width="38%" />
+<col width="62%" />
+</colgroup>
+<thead valign="bottom">
+<tr class="row-odd"><th class="head">attribute</th>
+<th class="head">rendered to</th>
+</tr>
+</thead>
+<tbody valign="top">
+<tr class="row-even"><td>doc</td>
+<td>monospace</td>
+</tr>
+<tr class="row-odd"><td>doc_json</td>
+<td>json</td>
+</tr>
+<tr class="row-even"><td>doc_yaml</td>
+<td>yaml</td>
+</tr>
+<tr class="row-odd"><td>doc_md</td>
+<td>markdown</td>
+</tr>
+<tr class="row-even"><td>doc_rst</td>
+<td>reStructuredText</td>
+</tr>
+</tbody>
+</table>
+<p>Please note that for dags, <code class="docutils literal"><span class="pre">doc_md</span></code> is the only attribute interpreted.</p>
+<p>This is especially useful if your tasks are built dynamically from
+configuration files, as it allows you to expose the configuration that led
+to the related tasks in Airflow.</p>
+<p>This content gets rendered on the &#8220;Graph View&#8221; page for dags and on the
+&#8220;Task Details&#8221; page for tasks.</p>
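+<p>As a minimal, hypothetical sketch (the dag and task names are made up),
+attaching such documentation could look like this:</p>
+<div class="code python highlight-default"><div class="highlight"><pre>from datetime import datetime
+
+from airflow import DAG
+from airflow.operators import BashOperator
+
+dag = DAG('docs_demo', start_date=datetime(2016, 1, 1))
+
+# markdown rendered on the dag's "Graph View" page
+dag.doc_md = """
+### Docs demo
+This dag is built from a (hypothetical) configuration file.
+"""
+
+t = BashOperator(task_id='print_date', bash_command='date', dag=dag)
+# markdown rendered on the task's "Task Details" page
+t.doc_md = 'Prints the current date.'
+</pre></div>
+</div>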
+</div>
+<div class="section" id="jinja-templating">
+<h3>Jinja Templating<a class="headerlink" href="#jinja-templating" title="Permalink to this headline">&para;</a></h3>
+<p>Airflow leverages the power of
+<a class="reference external" href="http://jinja.pocoo.org/docs/dev/">Jinja Templating</a> and this can be a
+powerful tool to use in combination with macros (see the <a class="reference internal" href="code.html#macros"><span class="std std-ref">Macros</span></a> section).</p>
+<p>For example, say you want to pass the execution date as an environment variable
+to a Bash script using the <code class="docutils literal"><span class="pre">BashOperator</span></code>.</p>
+<div class="code python highlight-default"><div class="highlight"><pre><span></span><span class="c1"># The execution date as YYYY-MM-DD</span>
+<span class="n">date</span> <span class="o">=</span> <span class="s2">&quot;{{ ds }}&quot;</span>
+<span class="n">t</span> <span class="o">=</span> <span class="n">BashOperator</span><span class="p">(</span>
+    <span class="n">task_id</span><span class="o">=</span><span class="s1">&#39;test_env&#39;</span><span class="p">,</span>
+    <span class="n">bash_command</span><span class="o">=</span><span class="s1">&#39;/tmp/test.sh &#39;</span><span class="p">,</span>
+    <span class="n">dag</span><span class="o">=</span><span class="n">dag</span><span class="p">,</span>
+    <span class="n">env</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;EXECUTION_DATE&#39;</span><span class="p">:</span> <span class="n">date</span><span class="p">})</span>
+</pre></div>
+</div>
+<p>Here, <code class="docutils literal"><span class="pre">{{</span> <span class="pre">ds</span> <span class="pre">}}</span></code> is a macro, and because the <code class="docutils literal"><span class="pre">env</span></code> parameter of the
+<code class="docutils literal"><span class="pre">BashOperator</span></code> is templated with Jinja, the execution date will be available
+as an environment variable named <code class="docutils literal"><span class="pre">EXECUTION_DATE</span></code> in your Bash script.</p>
+<p>You can use Jinja templating with every parameter that is marked as &#8220;templated&#8221;
+in the documentation.</p>
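+<p>For instance, since <code class="docutils literal"><span class="pre">bash_command</span></code> itself is templated, a macro can also be
+embedded directly in the command (a minimal sketch; the task name is made up, and
+<code class="docutils literal"><span class="pre">dag</span></code> is assumed to be defined elsewhere):</p>
+<div class="code python highlight-default"><div class="highlight"><pre># {{ ds }} is substituted at runtime with the execution date
+t2 = BashOperator(
+    task_id='echo_date',
+    bash_command='echo "Execution date: {{ ds }}"',
+    dag=dag)
+</pre></div>
+</div>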
+</div>
+</div>
+<div class="section" id="packaged-dags">
+<h2>Packaged dags<a class="headerlink" href="#packaged-dags" title="Permalink to this headline">&para;</a></h2>
+<p>While you will often define dags in a single <code class="docutils literal"><span class="pre">.py</span></code> file, it might sometimes
+be necessary to combine a dag with its dependencies. For example, you might want
+to combine several dags so that you can version and manage them together, or you
+might need an extra module that is not available by default on the system you are
+running airflow on. To allow this you can create a zip file that contains the
+dag(s) in the root of the zip file and have the extra modules unpacked in
+directories.</p>
+<p>For instance, you can create a zip file that looks like this:</p>
+<div class="highlight-bash"><div class="highlight"><pre><span></span>my_dag1.py
+my_dag2.py
+package1/__init__.py
+package1/functions.py
+</pre></div>
+</div>
+<p>Airflow will scan the zip file and try to load <code class="docutils literal"><span class="pre">my_dag1.py</span></code> and <code class="docutils literal"><span class="pre">my_dag2.py</span></code>.
+It will not go into subdirectories as these are considered to be potential
+packages.</p>
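+<p>A dag file at the root of the zip can then import the bundled modules directly.
+As a sketch (the helper function is hypothetical), <code class="docutils literal"><span class="pre">my_dag1.py</span></code> might contain:</p>
+<div class="code python highlight-default"><div class="highlight"><pre>from datetime import datetime
+
+from airflow import DAG
+from airflow.operators import PythonOperator
+
+# package1 is resolved from the zip file itself, since the zip
+# is prepended to sys.path when Airflow loads it
+from package1.functions import do_work  # hypothetical helper
+
+dag = DAG('my_dag1', start_date=datetime(2016, 1, 1))
+
+t = PythonOperator(task_id='do_work', python_callable=do_work, dag=dag)
+</pre></div>
+</div>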
+<p>In case you would like to add module dependencies to your dag you basically would
+do the same, but then it makes more sense to use a virtualenv and pip.</p>
+<div class="highlight-bash"><div class="highlight"><pre><span></span>virtualenv zip_dag
+<span class="nb">source</span> zip_dag/bin/activate
+
+mkdir zip_dag_contents
+<span class="nb">cd</span> zip_dag_contents
+
+pip install --install-option<span class="o">=</span><span class="s2">&quot;--install-lib=</span><span class="nv">$PWD</span><span class="s2">&quot;</span> my_useful_package
+cp ~/my_dag.py .
+
+zip -r zip_dag.zip *
+</pre></div>
+</div>
+<div class="admonition note">
+<p class="first admonition-title">Note</p>
+<p class="last">the zip file will be inserted at the beginning of module search list
+(sys.path) and as such it will be available to any other code that resides
+within the same interpreter.</p>
+</div>
+<div class="admonition note">
+<p class="first admonition-title">Note</p>
+<p class="last">packaged dags cannot be used with pickling turned on.</p>
+</div>
+<div class="admonition note">
+<p class="first admonition-title">Note</p>
+<p class="last">packaged dags cannot contain dynamic libraries (eg. libz.so) these need
+to be available on the system if a module needs those. In other words only
+pure python modules can be packaged.</p>
+</div>
+</div>
+</div>
+
+
+           </div>
+          </div>
+          <footer>
+  
+    <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
+      
+        <a href="profiling.html" class="btn btn-neutral float-right" title="Data Profiling" accesskey="n">Next <span class="fa fa-arrow-circle-right"></span></a>
+      
+      
+        <a href="ui.html" class="btn btn-neutral" title="UI / Screenshots" accesskey="p"><span class="fa fa-arrow-circle-left"></span> Previous</a>
+      
+    </div>
+  
+
+  <hr/>
+
+  <div role="contentinfo">
+    <p>
+        &copy; Copyright 2014, Maxime Beauchemin, Airbnb.
+
+    </p>
+  </div>
+  Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>. 
+
+</footer>
+
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+  
+
+
+  
+
+    <script type="text/javascript">
+        var DOCUMENTATION_OPTIONS = {
+            URL_ROOT:'./',
+            VERSION:'',
+            COLLAPSE_INDEX:false,
+            FILE_SUFFIX:'.html',
+            HAS_SOURCE:  true
+        };
+    </script>
+      <script type="text/javascript" src="_static/jquery.js"></script>
+      <script type="text/javascript" src="_static/underscore.js"></script>
+      <script type="text/javascript" src="_static/doctools.js"></script>
+
+  
+
+  
+  
+    <script type="text/javascript" src="_static/js/theme.js"></script>
+  
+
+  
+  
+  <script type="text/javascript">
+      jQuery(function () {
+          SphinxRtdTheme.StickyNav.enable();
+      });
+  </script>
+   
+
+</body>
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-airflow-site/blob/9e19165c/configuration.html
----------------------------------------------------------------------
diff --git a/configuration.html b/configuration.html
new file mode 100644
index 0000000..3859b34
--- /dev/null
+++ b/configuration.html
@@ -0,0 +1,419 @@
+
+
+<!DOCTYPE html>
+<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
+<head>
+  <meta charset="utf-8">
+  
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  
+  <title>Configuration &mdash; Airflow Documentation</title>
+  
+
+  
+  
+
+  
+
+  
+  
+    
+
+  
+
+  
+  
+    <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
+  
+
+  
+
+  
+    <link rel="top" title="Airflow Documentation" href="index.html"/>
+        <link rel="next" title="UI / Screenshots" href="ui.html"/>
+        <link rel="prev" title="Tutorial" href="tutorial.html"/> 
+
+  
+  <script src="_static/js/modernizr.min.js"></script>
+
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+
+    
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
+      <div class="wy-side-scroll">
+        <div class="wy-side-nav-search">
+          
+
+          
+            <a href="index.html" class="icon icon-home"> Airflow
+          
+
+          
+          </a>
+
+          
+            
+            
+          
+
+          
+<div role="search">
+  <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
+    <input type="text" name="q" placeholder="Search docs" />
+    <input type="hidden" name="check_keywords" value="yes" />
+    <input type="hidden" name="area" value="default" />
+  </form>
+</div>
+
+          
+        </div>
+
+        <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
+          
+            
+            
+                <ul class="current">
+<li class="toctree-l1"><a class="reference internal" href="project.html">Project</a></li>
+<li class="toctree-l1"><a class="reference internal" href="license.html">License</a></li>
+<li class="toctree-l1"><a class="reference internal" href="start.html">Quick Start</a></li>
+<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
+<li class="toctree-l1"><a class="reference internal" href="tutorial.html">Tutorial</a></li>
+<li class="toctree-l1 current"><a class="current reference internal" href="#">Configuration</a><ul>
+<li class="toctree-l2"><a class="reference internal" href="#setting-configuration-options">Setting Configuration Options</a></li>
+<li class="toctree-l2"><a class="reference internal" href="#setting-up-a-backend">Setting up a Backend</a></li>
+<li class="toctree-l2"><a class="reference internal" href="#connections">Connections</a></li>
+<li class="toctree-l2"><a class="reference internal" href="#scaling-out-with-celery">Scaling Out with Celery</a></li>
+<li class="toctree-l2"><a class="reference internal" href="#logs">Logs</a></li>
+<li class="toctree-l2"><a class="reference internal" href="#scaling-out-on-mesos-community-contributed">Scaling Out on Mesos (community contributed)</a></li>
+<li class="toctree-l2"><a class="reference internal" href="#integration-with-systemd">Integration with systemd</a></li>
+<li class="toctree-l2"><a class="reference internal" href="#integration-with-upstart">Integration with upstart</a></li>
+</ul>
+</li>
+<li class="toctree-l1"><a class="reference internal" href="ui.html">UI / Screenshots</a></li>
+<li class="toctree-l1"><a class="reference internal" href="concepts.html">Concepts</a></li>
+<li class="toctree-l1"><a class="reference internal" href="profiling.html">Data Profiling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="cli.html">Command Line Interface</a></li>
+<li class="toctree-l1"><a class="reference internal" href="scheduler.html">Scheduling &amp; Triggers</a></li>
+<li class="toctree-l1"><a class="reference internal" href="plugins.html">Plugins</a></li>
+<li class="toctree-l1"><a class="reference internal" href="security.html">Security</a></li>
+<li class="toctree-l1"><a class="reference internal" href="faq.html">FAQ</a></li>
+<li class="toctree-l1"><a class="reference internal" href="code.html">API Reference</a></li>
+</ul>
+
+            
+          
+        </div>
+      </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+
+      
+      <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
+        <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+        <a href="index.html">Airflow</a>
+      </nav>
+
+
+      
+      <div class="wy-nav-content">
+        <div class="rst-content">
+          
+
+ 
+
+
+
+<div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="index.html">Docs</a> &raquo;</li>
+      
+    <li>Configuration</li>
+      <li class="wy-breadcrumbs-aside">
+        
+          
+            <a href="_sources/configuration.txt" rel="nofollow"> View page source</a>
+          
+        
+      </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+           <div itemprop="articleBody">
+            
+  <div class="section" id="configuration">
+<h1>Configuration<a class="headerlink" href="#configuration" title="Permalink to this headline">&para;</a></h1>
+<p>Setting up the sandbox in the <a class="reference internal" href="start.html"><span class="doc">Quick Start</span></a> section was easy;
+building a production-grade environment requires a bit more work!</p>
+<div class="section" id="setting-configuration-options">
+<h2>Setting Configuration Options<a class="headerlink" href="#setting-configuration-options" title="Permalink to this headline">&para;</a></h2>
+<p>The first time you run Airflow, it will create a file called <code class="docutils literal"><span class="pre">airflow.cfg</span></code> in
+your <code class="docutils literal"><span class="pre">$AIRFLOW_HOME</span></code> directory (<code class="docutils literal"><span class="pre">~/airflow</span></code> by default). This file contains Airflow&#8217;s configuration and you
+can edit it to change any of the settings. You can also set options with environment variables by using this format:
+<code class="docutils literal"><span class="pre">$AIRFLOW__{SECTION}__{KEY}</span></code> (note the double underscores).</p>
+<p>For example, the
+metadata database connection string can either be set in <code class="docutils literal"><span class="pre">airflow.cfg</span></code> like this:</p>
+<div class="highlight-bash"><div class="highlight"><pre><span></span><span class="o">[</span>core<span class="o">]</span>
+<span class="nv">sql_alchemy_conn</span> <span class="o">=</span> my_conn_string
+</pre></div>
+</div>
+<p>or by creating a corresponding environment variable:</p>
+<div class="highlight-bash"><div class="highlight"><pre><span></span><span class="nv">AIRFLOW__CORE__SQL_ALCHEMY_CONN</span><span class="o">=</span>my_conn_string
+</pre></div>
+</div>
+<p>You can also derive the connection string at run time by appending <code class="docutils literal"><span class="pre">_cmd</span></code> to the key like this:</p>
+<div class="highlight-bash"><div class="highlight"><pre><span></span><span class="o">[</span>core<span class="o">]</span>
+<span class="nv">sql_alchemy_conn_cmd</span> <span class="o">=</span> bash_command_to_run
+</pre></div>
+</div>
+<p>Only three such configuration elements, namely <code class="docutils literal"><span class="pre">sql_alchemy_conn</span></code>, <code class="docutils literal"><span class="pre">broker_url</span></code> and <code class="docutils literal"><span class="pre">celery_result_backend</span></code>, can be fetched as a command. The idea behind this is to avoid storing passwords on boxes in plain text files. The order of precedence is as follows:</p>
+<ol class="arabic simple">
+<li>environment variable</li>
+<li>configuration in airflow.cfg</li>
+<li>command in airflow.cfg</li>
+<li>default</li>
+</ol>
+</div>
+<div class="section" id="setting-up-a-backend">
+<h2>Setting up a Backend<a class="headerlink" href="#setting-up-a-backend" title="Permalink to this headline">&para;</a></h2>
+<p>If you want to take a real test drive of Airflow, you should consider
+setting up a real database backend and switching to the LocalExecutor.</p>
+<p>As Airflow was built to interact with its metadata using the great SqlAlchemy
+library, you should be able to use any database backend supported as a
+SqlAlchemy backend. We recommend using <strong>MySQL</strong> or <strong>Postgres</strong>.</p>
+<div class="admonition note">
+<p class="first admonition-title">Note</p>
+<p class="last">If you decide to use <strong>Postgres</strong>, we recommend using the <code class="docutils literal"><span class="pre">psycopg2</span></code>
+driver and specifying it in your SqlAlchemy connection string.
+Also note that since SqlAlchemy does not expose a way to target a
+specific schema in the Postgres connection URI, you may
+want to set a default schema for your role with a
+command similar to <code class="docutils literal"><span class="pre">ALTER</span> <span class="pre">ROLE</span> <span class="pre">username</span> <span class="pre">SET</span> <span class="pre">search_path</span> <span class="pre">=</span> <span class="pre">airflow,</span> <span class="pre">foobar;</span></code></p>
+</div>
+<p>Once you&#8217;ve set up your database to host Airflow, you&#8217;ll need to alter the
+SqlAlchemy connection string located in your configuration file
+<code class="docutils literal"><span class="pre">$AIRFLOW_HOME/airflow.cfg</span></code>. You should then also change the &#8220;executor&#8221;
+setting to use &#8220;LocalExecutor&#8221;, an executor that can parallelize task
+instances locally.</p>
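+<p>For example (the connection string below is a placeholder; substitute your own
+credentials and database), the relevant <code class="docutils literal"><span class="pre">airflow.cfg</span></code> entries might look like:</p>
+<div class="highlight-bash"><div class="highlight"><pre><span></span>[core]
+executor = LocalExecutor
+sql_alchemy_conn = postgresql+psycopg2://user:pass@localhost:5432/airflow
+</pre></div>
+</div>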
+<div class="highlight-bash"><div class="highlight"><pre><span></span><span class="c1"># initialize the database</span>
+airflow initdb
+</pre></div>
+</div>
+</div>
+<div class="section" id="connections">
+<h2>Connections<a class="headerlink" href="#connections" title="Permalink to this headline">&para;</a></h2>
+<p>Airflow needs to know how to connect to your environment. Information
+such as hostname, port, login and passwords to other systems and services is
+handled in the <code class="docutils literal"><span class="pre">Admin-&gt;Connection</span></code> section of the UI. The pipeline code you
+will author will reference the &#8216;conn_id&#8217; of the Connection objects.</p>
+<img alt="_images/connections.png" src="_images/connections.png" />
+<p>By default, Airflow will save the passwords for the connection in plain text
+within the metadata database. Installing the <code class="docutils literal"><span class="pre">crypto</span></code> package is highly
+recommended during installation, as it allows these passwords to be encrypted.
+The <code class="docutils literal"><span class="pre">crypto</span></code> package does require that your operating
+system have libffi-dev installed.</p>
+<p>Connections in Airflow pipelines can be created using environment variables.
+The environment variable needs to have a prefix of <code class="docutils literal"><span class="pre">AIRFLOW_CONN_</span></code> for
+Airflow with the value in a URI format to use the connection properly. Please
+see the <a class="reference internal" href="concepts.html"><span class="doc">Concepts</span></a> documentation for more information on environment
+variables and connections.</p>
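+<p>As a brief, hypothetical sketch, a connection with conn_id <code class="docutils literal"><span class="pre">my_postgres</span></code> could be
+supplied through the environment like this (in practice you would typically export the
+variable in the shell or service configuration that runs Airflow):</p>
+<div class="code python highlight-default"><div class="highlight"><pre>import os
+
+# Airflow resolves conn_id 'my_postgres' from AIRFLOW_CONN_MY_POSTGRES;
+# the value must be a URI. Host, user and password here are placeholders.
+os.environ['AIRFLOW_CONN_MY_POSTGRES'] = 'postgres://user:pass@dbhost:5432/mydb'
+</pre></div>
+</div>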
+</div>
+<div class="section" id="scaling-out-with-celery">
+<h2>Scaling Out with Celery<a class="headerlink" href="#scaling-out-with-celery" title="Permalink to this headline">&para;</a></h2>
+<p><code class="docutils literal"><span class="pre">CeleryExecutor</span></code> is one of the ways you can scale out the number of workers. For this
+to work, you need to setup a Celery backend (<strong>RabbitMQ</strong>, <strong>Redis</strong>, ...) and
+change your <code class="docutils literal"><span class="pre">airflow.cfg</span></code> to point the executor parameter to
+<code class="docutils literal"><span class="pre">CeleryExecutor</span></code> and provide the related Celery settings.</p>
+<p>For more information about setting up a Celery broker, refer to the
+exhaustive <a class="reference external" href="http://docs.celeryproject.org/en/latest/getting-started/brokers/index.html">Celery documentation on the topic</a>.</p>
+<p>Here are a few imperative requirements for your workers:</p>
+<ul class="simple">
+<li><code class="docutils literal"><span class="pre">airflow</span></code> needs to be installed, and the CLI needs to be in the path</li>
+<li>Airflow configuration settings should be homogeneous across the cluster</li>
+<li>Operators that are executed on the worker need to have their dependencies
+met in that context. For example, if you use the <code class="docutils literal"><span class="pre">HiveOperator</span></code>,
+the hive CLI needs to be installed on that box, or if you use the
+<code class="docutils literal"><span class="pre">MySqlOperator</span></code>, the required Python library needs to be available in
+the <code class="docutils literal"><span class="pre">PYTHONPATH</span></code> somehow</li>
+<li>The worker needs to have access to its <code class="docutils literal"><span class="pre">DAGS_FOLDER</span></code>, and you need to
+synchronize the filesystems by your own means. A common setup would be to
+store your DAGS_FOLDER in a Git repository and sync it across machines using
+Chef, Puppet, Ansible, or whatever you use to configure machines in your
+environment. If all your boxes have a common mount point, having your
+pipelines files shared there should work as well</li>
+</ul>
+<p>To kick off a worker, you need to set up Airflow and run the worker
+subcommand</p>
+<div class="highlight-bash"><div class="highlight"><pre><span></span>airflow worker
+</pre></div>
+</div>
+<p>Your worker should start picking up tasks as soon as they get fired in
+its direction.</p>
+<p>Note that you can also run &#8220;Celery Flower&#8221;, a web UI built on top of Celery,
+to monitor your workers. You can use the shortcut command <code class="docutils literal"><span class="pre">airflow</span> <span class="pre">flower</span></code>
+to start a Flower web server.</p>
+</div>
+<div class="section" id="logs">
+<h2>Logs<a class="headerlink" href="#logs" title="Permalink to this headline">&para;</a></h2>
+<p>Users can specify a logs folder in <code class="docutils literal"><span class="pre">airflow.cfg</span></code>. By default, it is in
+the <code class="docutils literal"><span class="pre">AIRFLOW_HOME</span></code> directory.</p>
+<p>In addition, users can supply a remote location for storing logs and log backups
+in cloud storage. At this time, Amazon S3 and Google Cloud Storage are supported.
+To enable this feature, <code class="docutils literal"><span class="pre">airflow.cfg</span></code> must be configured as in this example:</p>
+<div class="highlight-bash"><div class="highlight"><pre><span></span><span class="o">[</span>core<span class="o">]</span>
+<span class="c1"># Airflow can store logs remotely in AWS S3 or Google Cloud Storage. Users</span>
+<span class="c1"># must supply a remote location URL (starting with either &#39;s3://...&#39; or</span>
+<span class="c1"># &#39;gs://...&#39;) and an Airflow connection id that provides access to the storage</span>
+<span class="c1"># location.</span>
+<span class="nv">remote_base_log_folder</span> <span class="o">=</span> s3://my-bucket/path/to/logs
+<span class="nv">remote_log_conn_id</span> <span class="o">=</span> MyS3Conn
+<span class="c1"># Use server-side encryption for logs stored in S3</span>
+<span class="nv">encrypt_s3_logs</span> <span class="o">=</span> False
+</pre></div>
+</div>
+<p>Remote logging uses an existing Airflow connection to read/write logs. If you don&#8217;t
+have a connection properly setup, this will fail. In the above example, Airflow will
+try to use <code class="docutils literal"><span class="pre">S3Hook('MyS3Conn')</span></code>.</p>
+<p>In the Airflow Web UI, local logs take precedence over remote logs. If local logs
+cannot be found or accessed, the remote logs will be displayed. Note that logs
+are only sent to remote storage once a task completes (including failure). In other
+words, remote logs for running tasks are unavailable.</p>
+</div>
+<div class="section" id="scaling-out-on-mesos-community-contributed">
+<h2>Scaling Out on Mesos (community contributed)<a class="headerlink" href="#scaling-out-on-mesos-community-contributed" title="Permalink to this headline">&para;</a></h2>
+<p><code class="docutils literal"><span class="pre">MesosExecutor</span></code> allows you to schedule airflow tasks on a Mesos cluster.
+For this to work, you need a running mesos cluster and you must perform the following
+steps -</p>
+<ol class="arabic simple">
+<li>Install airflow on a machine where the web server and scheduler will run;
+let&#8217;s refer to this as the &#8220;Airflow server&#8221;.</li>
+<li>On the Airflow server, install mesos python eggs from <a class="reference external" href="http://open.mesosphere.com/downloads/mesos/">mesos downloads</a>.</li>
+<li>On the Airflow server, use a database (such as mysql) which can be accessed from mesos
+slave machines and add the configuration in <code class="docutils literal"><span class="pre">airflow.cfg</span></code>.</li>
+<li>Change your <code class="docutils literal"><span class="pre">airflow.cfg</span></code> to point the executor parameter to
+<cite>MesosExecutor</cite> and provide the related Mesos settings.</li>
+<li>On all mesos slaves, install airflow. Copy the <code class="docutils literal"><span class="pre">airflow.cfg</span></code> from the
+Airflow server (so that it uses the same SqlAlchemy connection).</li>
+<li>On all mesos slaves, run the following for serving logs:</li>
+</ol>
+<div class="highlight-bash"><div class="highlight"><pre><span></span>airflow serve_logs
+</pre></div>
+</div>
+<ol class="arabic simple" start="7">
+<li>On the Airflow server, to start processing/scheduling DAGs on mesos, run:</li>
+</ol>
+<div class="highlight-bash"><div class="highlight"><pre><span></span>airflow scheduler -p
+</pre></div>
+</div>
+<p>Note: the <code class="docutils literal"><span class="pre">-p</span></code> parameter is needed to pickle the DAGs.</p>
+<p>You can now see the airflow framework and corresponding tasks in the mesos UI.
+The logs for airflow tasks can be seen in the airflow UI as usual.</p>
+<p>For more information about mesos, refer to <a class="reference external" href="http://mesos.apache.org/documentation/latest/">mesos documentation</a>.
+For any queries/bugs on <cite>MesosExecutor</cite>, please contact <a class="reference external" href="https://github.com/kapil-malik">&#64;kapil-malik</a>.</p>
+</div>
+<div class="section" id="integration-with-systemd">
+<h2>Integration with systemd<a class="headerlink" href="#integration-with-systemd" title="Permalink to this headline">&para;</a></h2>
+<p>Airflow can integrate with systemd based systems. This makes watching your
+daemons easy as systemd can take care of restarting a daemon on failure.
+In the <code class="docutils literal"><span class="pre">scripts/systemd</span></code> directory you can find unit files that
+have been tested on Red Hat based systems. You can copy those to
+<code class="docutils literal"><span class="pre">/usr/lib/systemd/system</span></code>. It is assumed that Airflow will run under
+<code class="docutils literal"><span class="pre">airflow:airflow</span></code>. If not (or if you are running on a non Red Hat
+based system) you probably need to adjust the unit files.</p>
+<p>Environment configuration is picked up from <code class="docutils literal"><span class="pre">/etc/sysconfig/airflow</span></code>.
+An example file is supplied. Make sure to specify the <code class="docutils literal"><span class="pre">SCHEDULER_RUNS</span></code>
+variable in this file when you run the scheduler. You
+can also define here, for example, <code class="docutils literal"><span class="pre">AIRFLOW_HOME</span></code> or <code class="docutils literal"><span class="pre">AIRFLOW_CONFIG</span></code>.</p>
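+<p>As a sketch of what <code class="docutils literal"><span class="pre">/etc/sysconfig/airflow</span></code> might contain (the paths and
+values below are illustrative; consult the example file shipped with Airflow):</p>
+<div class="highlight-bash"><div class="highlight"><pre><span></span>AIRFLOW_HOME=/home/airflow/airflow
+AIRFLOW_CONFIG=/home/airflow/airflow/airflow.cfg
+# passed to the scheduler as its -n flag (number of runs before exiting)
+SCHEDULER_RUNS=5
+</pre></div>
+</div>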
+</div>
+<div class="section" id="integration-with-upstart">
+<h2>Integration with upstart<a class="headerlink" href="#integration-with-upstart" title="Permalink to this headline">&para;</a></h2>
+<p>Airflow can integrate with upstart based systems. Upstart automatically starts all airflow services for which you
+have a corresponding <code class="docutils literal"><span class="pre">*.conf</span></code> file in <code class="docutils literal"><span class="pre">/etc/init</span></code> upon system boot. On failure, upstart automatically restarts
+the process (until it reaches the re-spawn limit set in a <code class="docutils literal"><span class="pre">*.conf</span></code> file).</p>
+<p>You can find sample upstart job files in the <code class="docutils literal"><span class="pre">scripts/upstart</span></code> directory. These files have been tested on
+Ubuntu 14.04 LTS. You may have to adjust <code class="docutils literal"><span class="pre">start</span> <span class="pre">on</span></code> and <code class="docutils literal"><span class="pre">stop</span> <span class="pre">on</span></code> stanzas to make it work on other upstart
+systems. Some of the possible options are listed in <code class="docutils literal"><span class="pre">scripts/upstart/README</span></code>.</p>
+<p>Modify <code class="docutils literal"><span class="pre">*.conf</span></code> files as needed and copy them to the <code class="docutils literal"><span class="pre">/etc/init</span></code> directory. It is assumed that airflow will run
+under <code class="docutils literal"><span class="pre">airflow:airflow</span></code>. Change <code class="docutils literal"><span class="pre">setuid</span></code> and <code class="docutils literal"><span class="pre">setgid</span></code> in the <code class="docutils literal"><span class="pre">*.conf</span></code> files if you use a different user/group.</p>
+<p>You can use <code class="docutils literal"><span class="pre">initctl</span></code> to manually start, stop, or view the status of an airflow process that has been
+integrated with upstart</p>
+<div class="highlight-bash"><div class="highlight"><pre><span></span>initctl airflow-webserver status
+</pre></div>
+</div>
+</div>
+</div>
+
+
+           </div>
+          </div>
+          <footer>
+  
+    <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
+      
+        <a href="ui.html" class="btn btn-neutral float-right" title="UI / Screenshots" accesskey="n">Next <span class="fa fa-arrow-circle-right"></span></a>
+      
+      
+        <a href="tutorial.html" class="btn btn-neutral" title="Tutorial" accesskey="p"><span class="fa fa-arrow-circle-left"></span> Previous</a>
+      
+    </div>
+  
+
+  <hr/>
+
+  <div role="contentinfo">
+    <p>
+        &copy; Copyright 2014, Maxime Beauchemin, Airbnb.
+
+    </p>
+  </div>
+  Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>. 
+
+</footer>
+
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+  
+
+
+  
+
+    <script type="text/javascript">
+        var DOCUMENTATION_OPTIONS = {
+            URL_ROOT:'./',
+            VERSION:'',
+            COLLAPSE_INDEX:false,
+            FILE_SUFFIX:'.html',
+            HAS_SOURCE:  true
+        };
+    </script>
+      <script type="text/javascript" src="_static/jquery.js"></script>
+      <script type="text/javascript" src="_static/underscore.js"></script>
+      <script type="text/javascript" src="_static/doctools.js"></script>
+
+  
+
+  
+  
+    <script type="text/javascript" src="_static/js/theme.js"></script>
+  
+
+  
+  
+  <script type="text/javascript">
+      jQuery(function () {
+          SphinxRtdTheme.StickyNav.enable();
+      });
+  </script>
+   
+
+</body>
+</html>
\ No newline at end of file