You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by ma...@apache.org on 2016/06/05 05:24:11 UTC
[21/34] incubator-airflow-site git commit: Initial commit
http://git-wip-us.apache.org/repos/asf/incubator-airflow-site/blob/9e19165c/_modules/presto_hook.html
----------------------------------------------------------------------
diff --git a/_modules/presto_hook.html b/_modules/presto_hook.html
new file mode 100644
index 0000000..3368d4c
--- /dev/null
+++ b/_modules/presto_hook.html
@@ -0,0 +1,298 @@
+
+
+<!DOCTYPE html>
+<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
+<head>
+ <meta charset="utf-8">
+
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+
+ <title>presto_hook — Airflow Documentation</title>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
+
+
+
+
+
+ <link rel="top" title="Airflow Documentation" href="../index.html"/>
+ <link rel="up" title="Module code" href="index.html"/>
+
+
+ <script src="../_static/js/modernizr.min.js"></script>
+
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+ <div class="wy-grid-for-nav">
+
+
+ <nav data-toggle="wy-nav-shift" class="wy-nav-side">
+ <div class="wy-side-scroll">
+ <div class="wy-side-nav-search">
+
+
+
+ <a href="../index.html" class="icon icon-home"> Airflow
+
+
+
+ </a>
+
+
+
+
+
+
+
+<div role="search">
+ <form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
+ <input type="text" name="q" placeholder="Search docs" />
+ <input type="hidden" name="check_keywords" value="yes" />
+ <input type="hidden" name="area" value="default" />
+ </form>
+</div>
+
+
+ </div>
+
+ <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
+
+
+
+ <ul>
+<li class="toctree-l1"><a class="reference internal" href="../project.html">Project</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../license.html">License</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../start.html">Quick Start</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../tutorial.html">Tutorial</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../configuration.html">Configuration</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../ui.html">UI / Screenshots</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../concepts.html">Concepts</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../profiling.html">Data Profiling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../cli.html">Command Line Interface</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../scheduler.html">Scheduling & Triggers</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../plugins.html">Plugins</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../security.html">Security</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../faq.html">FAQ</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../code.html">API Reference</a></li>
+</ul>
+
+
+
+ </div>
+ </div>
+ </nav>
+
+ <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+
+
+ <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
+ <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+ <a href="../index.html">Airflow</a>
+ </nav>
+
+
+
+ <div class="wy-nav-content">
+ <div class="rst-content">
+
+
+
+
+
+
+<div role="navigation" aria-label="breadcrumbs navigation">
+ <ul class="wy-breadcrumbs">
+ <li><a href="../index.html">Docs</a> »</li>
+
+ <li><a href="index.html">Module code</a> »</li>
+
+ <li>presto_hook</li>
+ <li class="wy-breadcrumbs-aside">
+
+
+
+ </li>
+ </ul>
+ <hr/>
+</div>
+ <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+ <div itemprop="articleBody">
+
+ <h1>Source code for presto_hook</h1><div class="highlight"><pre>
+<span></span><span class="kn">from</span> <span class="nn">builtins</span> <span class="kn">import</span> <span class="nb">str</span>
+<span class="kn">from</span> <span class="nn">pyhive</span> <span class="kn">import</span> <span class="n">presto</span>
+<span class="kn">from</span> <span class="nn">pyhive.exc</span> <span class="kn">import</span> <span class="n">DatabaseError</span>
+
+<span class="kn">from</span> <span class="nn">airflow.hooks.dbapi_hook</span> <span class="kn">import</span> <span class="n">DbApiHook</span>
+
+<span class="kn">import</span> <span class="nn">logging</span>
+<span class="n">logging</span><span class="o">.</span><span class="n">getLogger</span><span class="p">(</span><span class="s2">"pyhive"</span><span class="p">)</span><span class="o">.</span><span class="n">setLevel</span><span class="p">(</span><span class="n">logging</span><span class="o">.</span><span class="n">INFO</span><span class="p">)</span>
+
+
+<span class="k">class</span> <span class="nc">PrestoException</span><span class="p">(</span><span class="ne">Exception</span><span class="p">):</span>
+ <span class="k">pass</span>
+
+
+<div class="viewcode-block" id="PrestoHook"><a class="viewcode-back" href="../code.html#airflow.hooks.PrestoHook">[docs]</a><span class="k">class</span> <span class="nc">PrestoHook</span><span class="p">(</span><span class="n">DbApiHook</span><span class="p">):</span>
+ <span class="sd">"""</span>
+<span class="sd"> Interact with Presto through PyHive!</span>
+
+<span class="sd"> >>> ph = PrestoHook()</span>
+<span class="sd"> >>> sql = "SELECT count(1) AS num FROM airflow.static_babynames"</span>
+<span class="sd"> >>> ph.get_records(sql)</span>
+<span class="sd"> [[340698]]</span>
+<span class="sd"> """</span>
+
+ <span class="n">conn_name_attr</span> <span class="o">=</span> <span class="s1">'presto_conn_id'</span>
+ <span class="n">default_conn_name</span> <span class="o">=</span> <span class="s1">'presto_default'</span>
+
+<div class="viewcode-block" id="PrestoHook.get_conn"><a class="viewcode-back" href="../code.html#airflow.hooks.PrestoHook.get_conn">[docs]</a> <span class="k">def</span> <span class="nf">get_conn</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+ <span class="sd">"""Returns a connection object"""</span>
+ <span class="n">db</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_connection</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">presto_conn_id</span><span class="p">)</span>
+ <span class="k">return</span> <span class="n">presto</span><span class="o">.</span><span class="n">connect</span><span class="p">(</span>
+ <span class="n">host</span><span class="o">=</span><span class="n">db</span><span class="o">.</span><span class="n">host</span><span class="p">,</span>
+ <span class="n">port</span><span class="o">=</span><span class="n">db</span><span class="o">.</span><span class="n">port</span><span class="p">,</span>
+ <span class="n">username</span><span class="o">=</span><span class="n">db</span><span class="o">.</span><span class="n">login</span><span class="p">,</span>
+ <span class="n">catalog</span><span class="o">=</span><span class="n">db</span><span class="o">.</span><span class="n">extra_dejson</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'catalog'</span><span class="p">,</span> <span class="s1">'hive'</span><span class="p">),</span>
+ <span class="n">schema</span><span class="o">=</span><span class="n">db</span><span class="o">.</span><span class="n">schema</span><span class="p">)</span></div>
+
+ <span class="nd">@staticmethod</span>
+ <span class="k">def</span> <span class="nf">_strip_sql</span><span class="p">(</span><span class="n">sql</span><span class="p">):</span>
+ <span class="k">return</span> <span class="n">sql</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span><span class="o">.</span><span class="n">rstrip</span><span class="p">(</span><span class="s1">';'</span><span class="p">)</span>
+
+<div class="viewcode-block" id="PrestoHook.get_records"><a class="viewcode-back" href="../code.html#airflow.hooks.PrestoHook.get_records">[docs]</a> <span class="k">def</span> <span class="nf">get_records</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">hql</span><span class="p">,</span> <span class="n">parameters</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
+ <span class="sd">"""</span>
+<span class="sd"> Get a set of records from Presto</span>
+<span class="sd"> """</span>
+ <span class="k">try</span><span class="p">:</span>
+ <span class="k">return</span> <span class="nb">super</span><span class="p">(</span><span class="n">PrestoHook</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">get_records</span><span class="p">(</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">_strip_sql</span><span class="p">(</span><span class="n">hql</span><span class="p">),</span> <span class="n">parameters</span><span class="p">)</span>
+ <span class="k">except</span> <span class="n">DatabaseError</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
+ <span class="k">if</span> <span class="p">(</span><span class="nb">hasattr</span><span class="p">(</span><span class="n">e</span><span class="p">,</span> <span class="s1">'message'</span><span class="p">)</span> <span class="ow">and</span>
+ <span class="s1">'errorName'</span> <span class="ow">in</span> <span class="n">e</span><span class="o">.</span><span class="n">message</span> <span class="ow">and</span>
+ <span class="s1">'message'</span> <span class="ow">in</span> <span class="n">e</span><span class="o">.</span><span class="n">message</span><span class="p">):</span>
+ <span class="c1"># Use the structured error data in the raised exception</span>
+ <span class="k">raise</span> <span class="n">PrestoException</span><span class="p">(</span><span class="s1">'{name}: {message}'</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
+ <span class="n">name</span><span class="o">=</span><span class="n">e</span><span class="o">.</span><span class="n">message</span><span class="p">[</span><span class="s1">'errorName'</span><span class="p">],</span> <span class="n">message</span><span class="o">=</span><span class="n">e</span><span class="o">.</span><span class="n">message</span><span class="p">[</span><span class="s1">'message'</span><span class="p">]))</span>
+ <span class="k">else</span><span class="p">:</span>
+ <span class="k">raise</span> <span class="n">PrestoException</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">e</span><span class="p">))</span></div>
+
+<div class="viewcode-block" id="PrestoHook.get_first"><a class="viewcode-back" href="../code.html#airflow.hooks.PrestoHook.get_first">[docs]</a> <span class="k">def</span> <span class="nf">get_first</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">hql</span><span class="p">,</span> <span class="n">parameters</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
+ <span class="sd">"""</span>
+<span class="sd"> Returns only the first row, regardless of how many rows the query</span>
+<span class="sd"> returns.</span>
+<span class="sd"> """</span>
+ <span class="k">try</span><span class="p">:</span>
+ <span class="k">return</span> <span class="nb">super</span><span class="p">(</span><span class="n">PrestoHook</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">get_first</span><span class="p">(</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">_strip_sql</span><span class="p">(</span><span class="n">hql</span><span class="p">),</span> <span class="n">parameters</span><span class="p">)</span>
+ <span class="k">except</span> <span class="n">DatabaseError</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
+ <span class="n">obj</span> <span class="o">=</span> <span class="nb">eval</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">e</span><span class="p">))</span>
+ <span class="k">raise</span> <span class="n">PrestoException</span><span class="p">(</span><span class="n">obj</span><span class="p">[</span><span class="s1">'message'</span><span class="p">])</span></div>
+
+<div class="viewcode-block" id="PrestoHook.get_pandas_df"><a class="viewcode-back" href="../code.html#airflow.hooks.PrestoHook.get_pandas_df">[docs]</a> <span class="k">def</span> <span class="nf">get_pandas_df</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">hql</span><span class="p">,</span> <span class="n">parameters</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
+ <span class="sd">"""</span>
+<span class="sd"> Get a pandas dataframe from a sql query.</span>
+<span class="sd"> """</span>
+ <span class="kn">import</span> <span class="nn">pandas</span>
+ <span class="n">cursor</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_cursor</span><span class="p">()</span>
+ <span class="k">try</span><span class="p">:</span>
+ <span class="n">cursor</span><span class="o">.</span><span class="n">execute</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_strip_sql</span><span class="p">(</span><span class="n">hql</span><span class="p">),</span> <span class="n">parameters</span><span class="p">)</span>
+ <span class="n">data</span> <span class="o">=</span> <span class="n">cursor</span><span class="o">.</span><span class="n">fetchall</span><span class="p">()</span>
+ <span class="k">except</span> <span class="n">DatabaseError</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
+ <span class="n">obj</span> <span class="o">=</span> <span class="nb">eval</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">e</span><span class="p">))</span>
+ <span class="k">raise</span> <span class="n">PrestoException</span><span class="p">(</span><span class="n">obj</span><span class="p">[</span><span class="s1">'message'</span><span class="p">])</span>
+ <span class="n">column_descriptions</span> <span class="o">=</span> <span class="n">cursor</span><span class="o">.</span><span class="n">description</span>
+ <span class="k">if</span> <span class="n">data</span><span class="p">:</span>
+ <span class="n">df</span> <span class="o">=</span> <span class="n">pandas</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
+ <span class="n">df</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">c</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">column_descriptions</span><span class="p">]</span>
+ <span class="k">else</span><span class="p">:</span>
+ <span class="n">df</span> <span class="o">=</span> <span class="n">pandas</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">()</span>
+ <span class="k">return</span> <span class="n">df</span></div>
+
+<div class="viewcode-block" id="PrestoHook.run"><a class="viewcode-back" href="../code.html#airflow.hooks.PrestoHook.run">[docs]</a> <span class="k">def</span> <span class="nf">run</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">hql</span><span class="p">,</span> <span class="n">parameters</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
+ <span class="sd">"""</span>
+<span class="sd"> Execute the statement against Presto. Can be used to create views.</span>
+<span class="sd"> """</span>
+ <span class="k">return</span> <span class="nb">super</span><span class="p">(</span><span class="n">PrestoHook</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_strip_sql</span><span class="p">(</span><span class="n">hql</span><span class="p">),</span> <span class="n">parameters</span><span class="p">)</span></div>
+
+ <span class="k">def</span> <span class="nf">insert_rows</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+ <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">()</span></div>
+</pre></div>
+
+ </div>
+ </div>
+ <footer>
+
+
+ <hr/>
+
+ <div role="contentinfo">
+ <p>
+ © Copyright 2014, Maxime Beauchemin, Airbnb.
+
+ </p>
+ </div>
+ Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+
+</footer>
+
+ </div>
+ </div>
+
+ </section>
+
+ </div>
+
+
+
+
+
+ <script type="text/javascript">
+ var DOCUMENTATION_OPTIONS = {
+ URL_ROOT:'../',
+ VERSION:'',
+ COLLAPSE_INDEX:false,
+ FILE_SUFFIX:'.html',
+ HAS_SOURCE: true
+ };
+ </script>
+ <script type="text/javascript" src="../_static/jquery.js"></script>
+ <script type="text/javascript" src="../_static/underscore.js"></script>
+ <script type="text/javascript" src="../_static/doctools.js"></script>
+
+
+
+
+
+ <script type="text/javascript" src="../_static/js/theme.js"></script>
+
+
+
+
+ <script type="text/javascript">
+ jQuery(function () {
+ SphinxRtdTheme.StickyNav.enable();
+ });
+ </script>
+
+
+</body>
+</html>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-airflow-site/blob/9e19165c/_modules/python_operator.html
----------------------------------------------------------------------
diff --git a/_modules/python_operator.html b/_modules/python_operator.html
new file mode 100644
index 0000000..0a9b321
--- /dev/null
+++ b/_modules/python_operator.html
@@ -0,0 +1,338 @@
+
+
+<!DOCTYPE html>
+<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
+<head>
+ <meta charset="utf-8">
+
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+
+ <title>python_operator — Airflow Documentation</title>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
+
+
+
+
+
+ <link rel="top" title="Airflow Documentation" href="../index.html"/>
+ <link rel="up" title="Module code" href="index.html"/>
+
+
+ <script src="../_static/js/modernizr.min.js"></script>
+
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+ <div class="wy-grid-for-nav">
+
+
+ <nav data-toggle="wy-nav-shift" class="wy-nav-side">
+ <div class="wy-side-scroll">
+ <div class="wy-side-nav-search">
+
+
+
+ <a href="../index.html" class="icon icon-home"> Airflow
+
+
+
+ </a>
+
+
+
+
+
+
+
+<div role="search">
+ <form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
+ <input type="text" name="q" placeholder="Search docs" />
+ <input type="hidden" name="check_keywords" value="yes" />
+ <input type="hidden" name="area" value="default" />
+ </form>
+</div>
+
+
+ </div>
+
+ <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
+
+
+
+ <ul>
+<li class="toctree-l1"><a class="reference internal" href="../project.html">Project</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../license.html">License</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../start.html">Quick Start</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../tutorial.html">Tutorial</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../configuration.html">Configuration</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../ui.html">UI / Screenshots</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../concepts.html">Concepts</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../profiling.html">Data Profiling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../cli.html">Command Line Interface</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../scheduler.html">Scheduling & Triggers</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../plugins.html">Plugins</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../security.html">Security</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../faq.html">FAQ</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../code.html">API Reference</a></li>
+</ul>
+
+
+
+ </div>
+ </div>
+ </nav>
+
+ <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+
+
+ <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
+ <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+ <a href="../index.html">Airflow</a>
+ </nav>
+
+
+
+ <div class="wy-nav-content">
+ <div class="rst-content">
+
+
+
+
+
+
+<div role="navigation" aria-label="breadcrumbs navigation">
+ <ul class="wy-breadcrumbs">
+ <li><a href="../index.html">Docs</a> »</li>
+
+ <li><a href="index.html">Module code</a> »</li>
+
+ <li>python_operator</li>
+ <li class="wy-breadcrumbs-aside">
+
+
+
+ </li>
+ </ul>
+ <hr/>
+</div>
+ <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+ <div itemprop="articleBody">
+
+ <h1>Source code for python_operator</h1><div class="highlight"><pre>
+<span></span><span class="kn">from</span> <span class="nn">builtins</span> <span class="kn">import</span> <span class="nb">str</span>
+<span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span>
+<span class="kn">import</span> <span class="nn">logging</span>
+
+<span class="kn">from</span> <span class="nn">airflow.models</span> <span class="kn">import</span> <span class="n">BaseOperator</span><span class="p">,</span> <span class="n">TaskInstance</span>
+<span class="kn">from</span> <span class="nn">airflow.utils.state</span> <span class="kn">import</span> <span class="n">State</span>
+<span class="kn">from</span> <span class="nn">airflow.utils.decorators</span> <span class="kn">import</span> <span class="n">apply_defaults</span>
+<span class="kn">from</span> <span class="nn">airflow</span> <span class="kn">import</span> <span class="n">settings</span>
+
+
+<div class="viewcode-block" id="PythonOperator"><a class="viewcode-back" href="../code.html#airflow.operators.PythonOperator">[docs]</a><span class="k">class</span> <span class="nc">PythonOperator</span><span class="p">(</span><span class="n">BaseOperator</span><span class="p">):</span>
+ <span class="sd">"""</span>
+<span class="sd"> Executes a Python callable</span>
+
+<span class="sd"> :param python_callable: A reference to an object that is callable</span>
+<span class="sd"> :type python_callable: python callable</span>
+<span class="sd"> :param op_kwargs: a dictionary of keyword arguments that will get unpacked</span>
+<span class="sd"> in your function</span>
+<span class="sd"> :type op_kwargs: dict</span>
+<span class="sd"> :param op_args: a list of positional arguments that will get unpacked when</span>
+<span class="sd"> calling your callable</span>
+<span class="sd"> :type op_args: list</span>
+<span class="sd"> :param provide_context: if set to true, Airflow will pass a set of</span>
+<span class="sd"> keyword arguments that can be used in your function. This set of</span>
+<span class="sd"> kwargs correspond exactly to what you can use in your jinja</span>
+<span class="sd"> templates. For this to work, you need to define `**kwargs` in your</span>
+<span class="sd"> function header.</span>
+<span class="sd"> :type provide_context: bool</span>
+<span class="sd"> :param templates_dict: a dictionary where the values are templates that</span>
+<span class="sd"> will get templated by the Airflow engine sometime between</span>
+<span class="sd"> ``__init__`` and ``execute`` takes place and are made available</span>
+<span class="sd"> in your callable's context after the template has been applied</span>
+<span class="sd"> :type templates_dict: dict of str</span>
+<span class="sd"> :param templates_exts: a list of file extensions to resolve while</span>
+<span class="sd"> processing templated fields, for examples ``['.sql', '.hql']``</span>
+<span class="sd"> """</span>
+ <span class="n">template_fields</span> <span class="o">=</span> <span class="p">(</span><span class="s1">'templates_dict'</span><span class="p">,)</span>
+ <span class="n">template_ext</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">()</span>
+ <span class="n">ui_color</span> <span class="o">=</span> <span class="s1">'#ffefeb'</span>
+
+ <span class="nd">@apply_defaults</span>
+ <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span>
+ <span class="bp">self</span><span class="p">,</span>
+ <span class="n">python_callable</span><span class="p">,</span>
+ <span class="n">op_args</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span>
+ <span class="n">op_kwargs</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span>
+ <span class="n">provide_context</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span>
+ <span class="n">templates_dict</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span>
+ <span class="n">templates_exts</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span>
+ <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
+ <span class="nb">super</span><span class="p">(</span><span class="n">PythonOperator</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">python_callable</span> <span class="o">=</span> <span class="n">python_callable</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">op_args</span> <span class="o">=</span> <span class="n">op_args</span> <span class="ow">or</span> <span class="p">[]</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">op_kwargs</span> <span class="o">=</span> <span class="n">op_kwargs</span> <span class="ow">or</span> <span class="p">{}</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">provide_context</span> <span class="o">=</span> <span class="n">provide_context</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">templates_dict</span> <span class="o">=</span> <span class="n">templates_dict</span>
+ <span class="k">if</span> <span class="n">templates_exts</span><span class="p">:</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">template_ext</span> <span class="o">=</span> <span class="n">templates_exts</span>
+
+ <span class="k">def</span> <span class="nf">execute</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">context</span><span class="p">):</span>
+ <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">provide_context</span><span class="p">:</span>
+ <span class="n">context</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">op_kwargs</span><span class="p">)</span>
+ <span class="n">context</span><span class="p">[</span><span class="s1">'templates_dict'</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">templates_dict</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">op_kwargs</span> <span class="o">=</span> <span class="n">context</span>
+
+ <span class="n">return_value</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">python_callable</span><span class="p">(</span><span class="o">*</span><span class="bp">self</span><span class="o">.</span><span class="n">op_args</span><span class="p">,</span> <span class="o">**</span><span class="bp">self</span><span class="o">.</span><span class="n">op_kwargs</span><span class="p">)</span>
+ <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Done. Returned value was: "</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">return_value</span><span class="p">))</span>
+ <span class="k">return</span> <span class="n">return_value</span></div>
+
+
+<div class="viewcode-block" id="BranchPythonOperator"><a class="viewcode-back" href="../code.html#airflow.operators.BranchPythonOperator">[docs]</a><span class="k">class</span> <span class="nc">BranchPythonOperator</span><span class="p">(</span><span class="n">PythonOperator</span><span class="p">):</span>
+ <span class="sd">"""</span>
+<span class="sd"> Allows a workflow to "branch" or follow a single path following the</span>
+<span class="sd"> execution of this task.</span>
+
+<span class="sd"> It derives the PythonOperator and expects a Python function that returns</span>
+<span class="sd"> the task_id to follow. The task_id returned should point to a task</span>
+<span class="sd"> directly downstream from {self}. All other "branches" or</span>
+<span class="sd"> directly downstream tasks are marked with a state of ``skipped`` so that</span>
+<span class="sd"> these paths can't move forward. The ``skipped`` states are propageted</span>
+<span class="sd"> downstream to allow for the DAG state to fill up and the DAG run's state</span>
+<span class="sd"> to be inferred.</span>
+
+<span class="sd"> Note that using tasks with ``depends_on_past=True`` downstream from</span>
+<span class="sd"> ``BranchPythonOperator`` is logically unsound as ``skipped`` status</span>
+<span class="sd"> will invariably lead to block tasks that depend on their past successes.</span>
+<span class="sd"> ``skipped`` states propagates where all directly upstream tasks are</span>
+<span class="sd"> ``skipped``.</span>
+<span class="sd"> """</span>
+ <span class="k">def</span> <span class="nf">execute</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">context</span><span class="p">):</span>
+ <span class="n">branch</span> <span class="o">=</span> <span class="nb">super</span><span class="p">(</span><span class="n">BranchPythonOperator</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">execute</span><span class="p">(</span><span class="n">context</span><span class="p">)</span>
+ <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Following branch "</span> <span class="o">+</span> <span class="n">branch</span><span class="p">)</span>
+ <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Marking other directly downstream tasks as skipped"</span><span class="p">)</span>
+ <span class="n">session</span> <span class="o">=</span> <span class="n">settings</span><span class="o">.</span><span class="n">Session</span><span class="p">()</span>
+ <span class="k">for</span> <span class="n">task</span> <span class="ow">in</span> <span class="n">context</span><span class="p">[</span><span class="s1">'task'</span><span class="p">]</span><span class="o">.</span><span class="n">downstream_list</span><span class="p">:</span>
+ <span class="k">if</span> <span class="n">task</span><span class="o">.</span><span class="n">task_id</span> <span class="o">!=</span> <span class="n">branch</span><span class="p">:</span>
+ <span class="n">ti</span> <span class="o">=</span> <span class="n">TaskInstance</span><span class="p">(</span>
+ <span class="n">task</span><span class="p">,</span> <span class="n">execution_date</span><span class="o">=</span><span class="n">context</span><span class="p">[</span><span class="s1">'ti'</span><span class="p">]</span><span class="o">.</span><span class="n">execution_date</span><span class="p">)</span>
+ <span class="n">ti</span><span class="o">.</span><span class="n">state</span> <span class="o">=</span> <span class="n">State</span><span class="o">.</span><span class="n">SKIPPED</span>
+ <span class="n">ti</span><span class="o">.</span><span class="n">start_date</span> <span class="o">=</span> <span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span>
+ <span class="n">ti</span><span class="o">.</span><span class="n">end_date</span> <span class="o">=</span> <span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span>
+ <span class="n">session</span><span class="o">.</span><span class="n">merge</span><span class="p">(</span><span class="n">ti</span><span class="p">)</span>
+ <span class="n">session</span><span class="o">.</span><span class="n">commit</span><span class="p">()</span>
+ <span class="n">session</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
+ <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Done."</span><span class="p">)</span></div>
+
+
+<div class="viewcode-block" id="ShortCircuitOperator"><a class="viewcode-back" href="../code.html#airflow.operators.ShortCircuitOperator">[docs]</a><span class="k">class</span> <span class="nc">ShortCircuitOperator</span><span class="p">(</span><span class="n">PythonOperator</span><span class="p">):</span>
+ <span class="sd">"""</span>
+<span class="sd"> Allows a workflow to continue only if a condition is met. Otherwise, the</span>
+<span class="sd"> workflow "short-circuits" and downstream tasks are skipped.</span>
+
+<span class="sd"> The ShortCircuitOperator is derived from the PythonOperator. It evaluates a</span>
+<span class="sd"> condition and short-circuits the workflow if the condition is False. Any</span>
+<span class="sd"> downstream tasks are marked with a state of "skipped". If the condition is</span>
+<span class="sd"> True, downstream tasks proceed as normal.</span>
+
+<span class="sd"> The condition is determined by the result of `python_callable`.</span>
+<span class="sd"> """</span>
+ <span class="k">def</span> <span class="nf">execute</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">context</span><span class="p">):</span>
+ <span class="n">condition</span> <span class="o">=</span> <span class="nb">super</span><span class="p">(</span><span class="n">ShortCircuitOperator</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">execute</span><span class="p">(</span><span class="n">context</span><span class="p">)</span>
+ <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Condition result is {}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">condition</span><span class="p">))</span>
+ <span class="k">if</span> <span class="n">condition</span><span class="p">:</span>
+ <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s1">'Proceeding with downstream tasks...'</span><span class="p">)</span>
+ <span class="k">return</span>
+ <span class="k">else</span><span class="p">:</span>
+ <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s1">'Skipping downstream tasks...'</span><span class="p">)</span>
+ <span class="n">session</span> <span class="o">=</span> <span class="n">settings</span><span class="o">.</span><span class="n">Session</span><span class="p">()</span>
+ <span class="k">for</span> <span class="n">task</span> <span class="ow">in</span> <span class="n">context</span><span class="p">[</span><span class="s1">'task'</span><span class="p">]</span><span class="o">.</span><span class="n">downstream_list</span><span class="p">:</span>
+ <span class="n">ti</span> <span class="o">=</span> <span class="n">TaskInstance</span><span class="p">(</span>
+ <span class="n">task</span><span class="p">,</span> <span class="n">execution_date</span><span class="o">=</span><span class="n">context</span><span class="p">[</span><span class="s1">'ti'</span><span class="p">]</span><span class="o">.</span><span class="n">execution_date</span><span class="p">)</span>
+ <span class="n">ti</span><span class="o">.</span><span class="n">state</span> <span class="o">=</span> <span class="n">State</span><span class="o">.</span><span class="n">SKIPPED</span>
+ <span class="n">ti</span><span class="o">.</span><span class="n">start_date</span> <span class="o">=</span> <span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span>
+ <span class="n">ti</span><span class="o">.</span><span class="n">end_date</span> <span class="o">=</span> <span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span>
+ <span class="n">session</span><span class="o">.</span><span class="n">merge</span><span class="p">(</span><span class="n">ti</span><span class="p">)</span>
+ <span class="n">session</span><span class="o">.</span><span class="n">commit</span><span class="p">()</span>
+ <span class="n">session</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
+ <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Done."</span><span class="p">)</span></div>
+</pre></div>
+
+ </div>
+ </div>
+ <footer>
+
+
+ <hr/>
+
+ <div role="contentinfo">
+ <p>
+ © Copyright 2014, Maxime Beauchemin, Airbnb.
+
+ </p>
+ </div>
+ Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+
+</footer>
+
+ </div>
+ </div>
+
+ </section>
+
+ </div>
+
+
+
+
+
+ <script type="text/javascript">
+ var DOCUMENTATION_OPTIONS = {
+ URL_ROOT:'../',
+ VERSION:'',
+ COLLAPSE_INDEX:false,
+ FILE_SUFFIX:'.html',
+ HAS_SOURCE: true
+ };
+ </script>
+ <script type="text/javascript" src="../_static/jquery.js"></script>
+ <script type="text/javascript" src="../_static/underscore.js"></script>
+ <script type="text/javascript" src="../_static/doctools.js"></script>
+
+
+
+
+
+ <script type="text/javascript" src="../_static/js/theme.js"></script>
+
+
+
+
+ <script type="text/javascript">
+ jQuery(function () {
+ SphinxRtdTheme.StickyNav.enable();
+ });
+ </script>
+
+
+</body>
+</html>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-airflow-site/blob/9e19165c/_modules/s3_to_hive_operator.html
----------------------------------------------------------------------
diff --git a/_modules/s3_to_hive_operator.html b/_modules/s3_to_hive_operator.html
new file mode 100644
index 0000000..e0016ac
--- /dev/null
+++ b/_modules/s3_to_hive_operator.html
@@ -0,0 +1,353 @@
+
+
+<!DOCTYPE html>
+<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
+<head>
+ <meta charset="utf-8">
+
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+
+ <title>s3_to_hive_operator — Airflow Documentation</title>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
+
+
+
+
+
+ <link rel="top" title="Airflow Documentation" href="../index.html"/>
+ <link rel="up" title="Module code" href="index.html"/>
+
+
+ <script src="../_static/js/modernizr.min.js"></script>
+
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+ <div class="wy-grid-for-nav">
+
+
+ <nav data-toggle="wy-nav-shift" class="wy-nav-side">
+ <div class="wy-side-scroll">
+ <div class="wy-side-nav-search">
+
+
+
+ <a href="../index.html" class="icon icon-home"> Airflow
+
+
+
+ </a>
+
+
+
+
+
+
+
+<div role="search">
+ <form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
+ <input type="text" name="q" placeholder="Search docs" />
+ <input type="hidden" name="check_keywords" value="yes" />
+ <input type="hidden" name="area" value="default" />
+ </form>
+</div>
+
+
+ </div>
+
+ <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
+
+
+
+ <ul>
+<li class="toctree-l1"><a class="reference internal" href="../project.html">Project</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../license.html">License</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../start.html">Quick Start</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../tutorial.html">Tutorial</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../configuration.html">Configuration</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../ui.html">UI / Screenshots</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../concepts.html">Concepts</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../profiling.html">Data Profiling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../cli.html">Command Line Interface</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../scheduler.html">Scheduling & Triggers</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../plugins.html">Plugins</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../security.html">Security</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../faq.html">FAQ</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../code.html">API Reference</a></li>
+</ul>
+
+
+
+ </div>
+ </div>
+ </nav>
+
+ <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+
+
+ <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
+ <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+ <a href="../index.html">Airflow</a>
+ </nav>
+
+
+
+ <div class="wy-nav-content">
+ <div class="rst-content">
+
+
+
+
+
+
+<div role="navigation" aria-label="breadcrumbs navigation">
+ <ul class="wy-breadcrumbs">
+ <li><a href="../index.html">Docs</a> »</li>
+
+ <li><a href="index.html">Module code</a> »</li>
+
+ <li>s3_to_hive_operator</li>
+ <li class="wy-breadcrumbs-aside">
+
+
+
+ </li>
+ </ul>
+ <hr/>
+</div>
+ <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+ <div itemprop="articleBody">
+
+ <h1>Source code for s3_to_hive_operator</h1><div class="highlight"><pre>
+<span></span><span class="kn">from</span> <span class="nn">builtins</span> <span class="kn">import</span> <span class="nb">next</span>
+<span class="kn">from</span> <span class="nn">builtins</span> <span class="kn">import</span> <span class="nb">zip</span>
+<span class="kn">import</span> <span class="nn">logging</span>
+<span class="kn">from</span> <span class="nn">tempfile</span> <span class="kn">import</span> <span class="n">NamedTemporaryFile</span>
+
+<span class="kn">from</span> <span class="nn">airflow.exceptions</span> <span class="kn">import</span> <span class="n">AirflowException</span>
+<span class="kn">from</span> <span class="nn">airflow.hooks</span> <span class="kn">import</span> <span class="n">HiveCliHook</span><span class="p">,</span> <span class="n">S3Hook</span>
+<span class="kn">from</span> <span class="nn">airflow.models</span> <span class="kn">import</span> <span class="n">BaseOperator</span>
+<span class="kn">from</span> <span class="nn">airflow.utils.decorators</span> <span class="kn">import</span> <span class="n">apply_defaults</span>
+
+
+<div class="viewcode-block" id="S3ToHiveTransfer"><a class="viewcode-back" href="../code.html#airflow.operators.S3ToHiveTransfer">[docs]</a><span class="k">class</span> <span class="nc">S3ToHiveTransfer</span><span class="p">(</span><span class="n">BaseOperator</span><span class="p">):</span>
+ <span class="sd">"""</span>
+<span class="sd"> Moves data from S3 to Hive. The operator downloads a file from S3,</span>
+<span class="sd"> stores the file locally before loading it into a Hive table.</span>
+<span class="sd"> If the ``create`` or ``recreate`` arguments are set to ``True``,</span>
+<span class="sd"> a ``CREATE TABLE`` and ``DROP TABLE`` statements are generated.</span>
+<span class="sd"> Hive data types are inferred from the cursor's metadata from.</span>
+
+<span class="sd"> Note that the table generated in Hive uses ``STORED AS textfile``</span>
+<span class="sd"> which isn't the most efficient serialization format. If a</span>
+<span class="sd"> large amount of data is loaded and/or if the tables gets</span>
+<span class="sd"> queried considerably, you may want to use this operator only to</span>
+<span class="sd"> stage the data into a temporary table before loading it into its</span>
+<span class="sd"> final destination using a ``HiveOperator``.</span>
+
+<span class="sd"> :param s3_key: The key to be retrieved from S3</span>
+<span class="sd"> :type s3_key: str</span>
+<span class="sd"> :param field_dict: A dictionary of the fields name in the file</span>
+<span class="sd"> as keys and their Hive types as values</span>
+<span class="sd"> :type field_dict: dict</span>
+<span class="sd"> :param hive_table: target Hive table, use dot notation to target a</span>
+<span class="sd"> specific database</span>
+<span class="sd"> :type hive_table: str</span>
+<span class="sd"> :param create: whether to create the table if it doesn't exist</span>
+<span class="sd"> :type create: bool</span>
+<span class="sd"> :param recreate: whether to drop and recreate the table at every</span>
+<span class="sd"> execution</span>
+<span class="sd"> :type recreate: bool</span>
+<span class="sd"> :param partition: target partition as a dict of partition columns</span>
+<span class="sd"> and values</span>
+<span class="sd"> :type partition: dict</span>
+<span class="sd"> :param headers: whether the file contains column names on the first</span>
+<span class="sd"> line</span>
+<span class="sd"> :type headers: bool</span>
+<span class="sd"> :param check_headers: whether the column names on the first line should be</span>
+<span class="sd"> checked against the keys of field_dict</span>
+<span class="sd"> :type check_headers: bool</span>
+<span class="sd"> :param wildcard_match: whether the s3_key should be interpreted as a Unix</span>
+<span class="sd"> wildcard pattern</span>
+<span class="sd"> :type wildcard_match: bool</span>
+<span class="sd"> :param delimiter: field delimiter in the file</span>
+<span class="sd"> :type delimiter: str</span>
+<span class="sd"> :param s3_conn_id: source s3 connection</span>
+<span class="sd"> :type s3_conn_id: str</span>
+<span class="sd"> :param hive_conn_id: destination hive connection</span>
+<span class="sd"> :type hive_conn_id: str</span>
+<span class="sd"> """</span>
+
+ <span class="n">template_fields</span> <span class="o">=</span> <span class="p">(</span><span class="s1">'s3_key'</span><span class="p">,</span> <span class="s1">'partition'</span><span class="p">,</span> <span class="s1">'hive_table'</span><span class="p">)</span>
+ <span class="n">template_ext</span> <span class="o">=</span> <span class="p">()</span>
+ <span class="n">ui_color</span> <span class="o">=</span> <span class="s1">'#a0e08c'</span>
+
+ <span class="nd">@apply_defaults</span>
+ <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span>
+ <span class="bp">self</span><span class="p">,</span>
+ <span class="n">s3_key</span><span class="p">,</span>
+ <span class="n">field_dict</span><span class="p">,</span>
+ <span class="n">hive_table</span><span class="p">,</span>
+ <span class="n">delimiter</span><span class="o">=</span><span class="s1">','</span><span class="p">,</span>
+ <span class="n">create</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span>
+ <span class="n">recreate</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span>
+ <span class="n">partition</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span>
+ <span class="n">headers</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span>
+ <span class="n">check_headers</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span>
+ <span class="n">wildcard_match</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span>
+ <span class="n">s3_conn_id</span><span class="o">=</span><span class="s1">'s3_default'</span><span class="p">,</span>
+ <span class="n">hive_cli_conn_id</span><span class="o">=</span><span class="s1">'hive_cli_default'</span><span class="p">,</span>
+ <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
+ <span class="nb">super</span><span class="p">(</span><span class="n">S3ToHiveTransfer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span> <span class="o">=</span> <span class="n">s3_key</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span> <span class="o">=</span> <span class="n">field_dict</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">hive_table</span> <span class="o">=</span> <span class="n">hive_table</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span> <span class="o">=</span> <span class="n">delimiter</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">create</span> <span class="o">=</span> <span class="n">create</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">recreate</span> <span class="o">=</span> <span class="n">recreate</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">partition</span> <span class="o">=</span> <span class="n">partition</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">headers</span> <span class="o">=</span> <span class="n">headers</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">check_headers</span> <span class="o">=</span> <span class="n">check_headers</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">wildcard_match</span> <span class="o">=</span> <span class="n">wildcard_match</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">hive_cli_conn_id</span> <span class="o">=</span> <span class="n">hive_cli_conn_id</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">s3_conn_id</span> <span class="o">=</span> <span class="n">s3_conn_id</span>
+
+ <span class="k">def</span> <span class="nf">execute</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">context</span><span class="p">):</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">hive</span> <span class="o">=</span> <span class="n">HiveCliHook</span><span class="p">(</span><span class="n">hive_cli_conn_id</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">hive_cli_conn_id</span><span class="p">)</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">s3</span> <span class="o">=</span> <span class="n">S3Hook</span><span class="p">(</span><span class="n">s3_conn_id</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_conn_id</span><span class="p">)</span>
+ <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Downloading S3 file"</span><span class="p">)</span>
+ <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">wildcard_match</span><span class="p">:</span>
+ <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">check_for_wildcard_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">):</span>
+ <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">"No key matches {0}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">))</span>
+ <span class="n">s3_key_object</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">get_wildcard_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">)</span>
+ <span class="k">else</span><span class="p">:</span>
+ <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">check_for_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">):</span>
+ <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span>
+ <span class="s2">"The key {0} does not exists"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">))</span>
+ <span class="n">s3_key_object</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">get_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">)</span>
+ <span class="k">with</span> <span class="n">NamedTemporaryFile</span><span class="p">(</span><span class="s2">"w"</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+ <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Dumping S3 key {0} contents to local"</span>
+ <span class="s2">" file {1}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">s3_key_object</span><span class="o">.</span><span class="n">key</span><span class="p">,</span> <span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">))</span>
+ <span class="n">s3_key_object</span><span class="o">.</span><span class="n">get_contents_to_file</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
+ <span class="n">f</span><span class="o">.</span><span class="n">flush</span><span class="p">()</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">connection</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
+ <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">headers</span><span class="p">:</span>
+ <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Loading file into Hive"</span><span class="p">)</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">hive</span><span class="o">.</span><span class="n">load_file</span><span class="p">(</span>
+ <span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">,</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">hive_table</span><span class="p">,</span>
+ <span class="n">field_dict</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span><span class="p">,</span>
+ <span class="n">create</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">create</span><span class="p">,</span>
+ <span class="n">partition</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">partition</span><span class="p">,</span>
+ <span class="n">delimiter</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span><span class="p">,</span>
+ <span class="n">recreate</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">recreate</span><span class="p">)</span>
+ <span class="k">else</span><span class="p">:</span>
+ <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="s1">'r'</span><span class="p">)</span> <span class="k">as</span> <span class="n">tmpf</span><span class="p">:</span>
+ <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">check_headers</span><span class="p">:</span>
+ <span class="n">header_l</span> <span class="o">=</span> <span class="n">tmpf</span><span class="o">.</span><span class="n">readline</span><span class="p">()</span>
+ <span class="n">header_line</span> <span class="o">=</span> <span class="n">header_l</span><span class="o">.</span><span class="n">rstrip</span><span class="p">()</span>
+ <span class="n">header_list</span> <span class="o">=</span> <span class="n">header_line</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span><span class="p">)</span>
+ <span class="n">field_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
+ <span class="n">test_field_match</span> <span class="o">=</span> <span class="p">[</span><span class="n">h1</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="o">==</span> <span class="n">h2</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="k">for</span> <span class="n">h1</span><span class="p">,</span> <span class="n">h2</span>
+ <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">header_list</span><span class="p">,</span> <span class="n">field_names</span><span class="p">)]</span>
+ <span class="k">if</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span><span class="n">test_field_match</span><span class="p">):</span>
+ <span class="n">logging</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="s2">"Headers do not match field names"</span>
+ <span class="s2">"File headers:</span><span class="se">\n</span><span class="s2"> {header_list}</span><span class="se">\n</span><span class="s2">"</span>
+ <span class="s2">"Field names: </span><span class="se">\n</span><span class="s2"> {field_names}</span><span class="se">\n</span><span class="s2">"</span>
+ <span class="s2">""</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="o">**</span><span class="nb">locals</span><span class="p">()))</span>
+ <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">"Headers do not match the "</span>
+ <span class="s2">"field_dict keys"</span><span class="p">)</span>
+ <span class="k">with</span> <span class="n">NamedTemporaryFile</span><span class="p">(</span><span class="s2">"w"</span><span class="p">)</span> <span class="k">as</span> <span class="n">f_no_headers</span><span class="p">:</span>
+ <span class="n">tmpf</span><span class="o">.</span><span class="n">seek</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
+ <span class="nb">next</span><span class="p">(</span><span class="n">tmpf</span><span class="p">)</span>
+ <span class="k">for</span> <span class="n">line</span> <span class="ow">in</span> <span class="n">tmpf</span><span class="p">:</span>
+ <span class="n">f_no_headers</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">line</span><span class="p">)</span>
+ <span class="n">f_no_headers</span><span class="o">.</span><span class="n">flush</span><span class="p">()</span>
+ <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Loading file without headers into Hive"</span><span class="p">)</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">hive</span><span class="o">.</span><span class="n">load_file</span><span class="p">(</span>
+ <span class="n">f_no_headers</span><span class="o">.</span><span class="n">name</span><span class="p">,</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">hive_table</span><span class="p">,</span>
+ <span class="n">field_dict</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span><span class="p">,</span>
+ <span class="n">create</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">create</span><span class="p">,</span>
+ <span class="n">partition</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">partition</span><span class="p">,</span>
+ <span class="n">delimiter</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span><span class="p">,</span>
+ <span class="n">recreate</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">recreate</span><span class="p">)</span></div>
+</pre></div>
+
+ </div>
+ </div>
+ <footer>
+
+
+ <hr/>
+
+ <div role="contentinfo">
+ <p>
+ © Copyright 2014, Maxime Beauchemin, Airbnb.
+
+ </p>
+ </div>
+ Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+
+</footer>
+
+ </div>
+ </div>
+
+ </section>
+
+ </div>
+
+
+
+
+
+ <script type="text/javascript">
+ var DOCUMENTATION_OPTIONS = {
+ URL_ROOT:'../',
+ VERSION:'',
+ COLLAPSE_INDEX:false,
+ FILE_SUFFIX:'.html',
+ HAS_SOURCE: true
+ };
+ </script>
+ <script type="text/javascript" src="../_static/jquery.js"></script>
+ <script type="text/javascript" src="../_static/underscore.js"></script>
+ <script type="text/javascript" src="../_static/doctools.js"></script>
+
+
+
+
+
+ <script type="text/javascript" src="../_static/js/theme.js"></script>
+
+
+
+
+ <script type="text/javascript">
+ jQuery(function () {
+ SphinxRtdTheme.StickyNav.enable();
+ });
+ </script>
+
+
+</body>
+</html>
\ No newline at end of file