You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by cr...@apache.org on 2018/01/03 17:48:11 UTC
[11/35] incubator-airflow-site git commit: 1.9.0
http://git-wip-us.apache.org/repos/asf/incubator-airflow-site/blob/28a3eb60/_modules/s3_to_hive_operator.html
----------------------------------------------------------------------
diff --git a/_modules/s3_to_hive_operator.html b/_modules/s3_to_hive_operator.html
index ad7715e..840a81b 100644
--- a/_modules/s3_to_hive_operator.html
+++ b/_modules/s3_to_hive_operator.html
@@ -13,6 +13,8 @@
+
+
@@ -30,6 +32,9 @@
+ <link rel="index" title="Index"
+ href="../genindex.html"/>
+ <link rel="search" title="Search" href="../search.html"/>
<link rel="top" title="Airflow Documentation" href="../index.html"/>
<link rel="up" title="Module code" href="index.html"/>
@@ -40,6 +45,7 @@
<body class="wy-body-for-nav" role="document">
+
<div class="wy-grid-for-nav">
@@ -76,7 +82,10 @@
- <ul>
+
+
+
+ <ul>
<li class="toctree-l1"><a class="reference internal" href="../project.html">Project</a></li>
<li class="toctree-l1"><a class="reference internal" href="../license.html">License</a></li>
<li class="toctree-l1"><a class="reference internal" href="../start.html">Quick Start</a></li>
@@ -90,6 +99,8 @@
<li class="toctree-l1"><a class="reference internal" href="../scheduler.html">Scheduling & Triggers</a></li>
<li class="toctree-l1"><a class="reference internal" href="../plugins.html">Plugins</a></li>
<li class="toctree-l1"><a class="reference internal" href="../security.html">Security</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../api.html">Experimental Rest API</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../integration.html">Integration</a></li>
<li class="toctree-l1"><a class="reference internal" href="../faq.html">FAQ</a></li>
<li class="toctree-l1"><a class="reference internal" href="../code.html">API Reference</a></li>
</ul>
@@ -104,8 +115,10 @@
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
- <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
- <a href="../index.html">Airflow</a>
+
+ <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+ <a href="../index.html">Airflow</a>
+
</nav>
@@ -118,19 +131,36 @@
+
+
+
+
+
+
+
+
+
+
<div role="navigation" aria-label="breadcrumbs navigation">
+
<ul class="wy-breadcrumbs">
- <li><a href="../index.html">Docs</a> »</li>
-
+
+ <li><a href="../index.html">Docs</a> »</li>
+
<li><a href="index.html">Module code</a> »</li>
-
- <li>s3_to_hive_operator</li>
+
+ <li>s3_to_hive_operator</li>
+
+
<li class="wy-breadcrumbs-aside">
-
+
</li>
+
</ul>
+
+
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
@@ -151,16 +181,21 @@
<span class="c1"># See the License for the specific language governing permissions and</span>
<span class="c1"># limitations under the License.</span>
-<span class="kn">from</span> <span class="nn">builtins</span> <span class="kn">import</span> <span class="nb">next</span>
-<span class="kn">from</span> <span class="nn">builtins</span> <span class="kn">import</span> <span class="nb">zip</span>
-<span class="kn">import</span> <span class="nn">logging</span>
-<span class="kn">from</span> <span class="nn">tempfile</span> <span class="kn">import</span> <span class="n">NamedTemporaryFile</span>
+<span class="kn">from</span> <span class="nn">builtins</span> <span class="k">import</span> <span class="nb">next</span>
+<span class="kn">from</span> <span class="nn">builtins</span> <span class="k">import</span> <span class="nb">zip</span>
+<span class="kn">from</span> <span class="nn">tempfile</span> <span class="k">import</span> <span class="n">NamedTemporaryFile</span>
+<span class="kn">from</span> <span class="nn">airflow.utils.file</span> <span class="k">import</span> <span class="n">TemporaryDirectory</span>
+<span class="kn">import</span> <span class="nn">gzip</span>
+<span class="kn">import</span> <span class="nn">bz2</span>
+<span class="kn">import</span> <span class="nn">tempfile</span>
+<span class="kn">import</span> <span class="nn">os</span>
-<span class="kn">from</span> <span class="nn">airflow.exceptions</span> <span class="kn">import</span> <span class="n">AirflowException</span>
-<span class="kn">from</span> <span class="nn">airflow.hooks.S3_hook</span> <span class="kn">import</span> <span class="n">S3Hook</span>
-<span class="kn">from</span> <span class="nn">airflow.hooks.hive_hooks</span> <span class="kn">import</span> <span class="n">HiveCliHook</span>
-<span class="kn">from</span> <span class="nn">airflow.models</span> <span class="kn">import</span> <span class="n">BaseOperator</span>
-<span class="kn">from</span> <span class="nn">airflow.utils.decorators</span> <span class="kn">import</span> <span class="n">apply_defaults</span>
+<span class="kn">from</span> <span class="nn">airflow.exceptions</span> <span class="k">import</span> <span class="n">AirflowException</span>
+<span class="kn">from</span> <span class="nn">airflow.hooks.S3_hook</span> <span class="k">import</span> <span class="n">S3Hook</span>
+<span class="kn">from</span> <span class="nn">airflow.hooks.hive_hooks</span> <span class="k">import</span> <span class="n">HiveCliHook</span>
+<span class="kn">from</span> <span class="nn">airflow.models</span> <span class="k">import</span> <span class="n">BaseOperator</span>
+<span class="kn">from</span> <span class="nn">airflow.utils.decorators</span> <span class="k">import</span> <span class="n">apply_defaults</span>
+<span class="kn">from</span> <span class="nn">airflow.utils.compression</span> <span class="k">import</span> <span class="n">uncompress_file</span>
<div class="viewcode-block" id="S3ToHiveTransfer"><a class="viewcode-back" href="../code.html#airflow.operators.S3ToHiveTransfer">[docs]</a><span class="k">class</span> <span class="nc">S3ToHiveTransfer</span><span class="p">(</span><span class="n">BaseOperator</span><span class="p">):</span>
@@ -205,10 +240,15 @@
<span class="sd"> :type wildcard_match: bool</span>
<span class="sd"> :param delimiter: field delimiter in the file</span>
<span class="sd"> :type delimiter: str</span>
-<span class="sd"> :param s3_conn_id: source s3 connection</span>
-<span class="sd"> :type s3_conn_id: str</span>
-<span class="sd"> :param hive_conn_id: destination hive connection</span>
-<span class="sd"> :type hive_conn_id: str</span>
+<span class="sd"> :param aws_conn_id: source s3 connection</span>
+<span class="sd"> :type aws_conn_id: str</span>
+<span class="sd"> :param hive_cli_conn_id: destination hive connection</span>
+<span class="sd"> :type hive_cli_conn_id: str</span>
+<span class="sd"> :param input_compressed: Boolean to determine if file decompression is</span>
+<span class="sd"> required to process headers</span>
+<span class="sd"> :type input_compressed: bool</span>
+<span class="sd"> :param tblproperties: TBLPROPERTIES of the hive table being created</span>
+<span class="sd"> :type tblproperties: dict</span>
<span class="sd"> """</span>
<span class="n">template_fields</span> <span class="o">=</span> <span class="p">(</span><span class="s1">'s3_key'</span><span class="p">,</span> <span class="s1">'partition'</span><span class="p">,</span> <span class="s1">'hive_table'</span><span class="p">)</span>
@@ -222,16 +262,18 @@
<span class="n">field_dict</span><span class="p">,</span>
<span class="n">hive_table</span><span class="p">,</span>
<span class="n">delimiter</span><span class="o">=</span><span class="s1">','</span><span class="p">,</span>
- <span class="n">create</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span>
- <span class="n">recreate</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span>
- <span class="n">partition</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span>
- <span class="n">headers</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span>
- <span class="n">check_headers</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span>
- <span class="n">wildcard_match</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span>
- <span class="n">s3_conn_id</span><span class="o">=</span><span class="s1">'s3_default'</span><span class="p">,</span>
+ <span class="n">create</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
+ <span class="n">recreate</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+ <span class="n">partition</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
+ <span class="n">headers</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+ <span class="n">check_headers</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+ <span class="n">wildcard_match</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+ <span class="n">aws_conn_id</span><span class="o">=</span><span class="s1">'aws_default'</span><span class="p">,</span>
<span class="n">hive_cli_conn_id</span><span class="o">=</span><span class="s1">'hive_cli_default'</span><span class="p">,</span>
+ <span class="n">input_compressed</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+ <span class="n">tblproperties</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
- <span class="nb">super</span><span class="p">(</span><span class="n">S3ToHiveTransfer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
+ <span class="nb">super</span><span class="p">(</span><span class="n">S3ToHiveTransfer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span> <span class="o">=</span> <span class="n">s3_key</span>
<span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span> <span class="o">=</span> <span class="n">field_dict</span>
<span class="bp">self</span><span class="o">.</span><span class="n">hive_table</span> <span class="o">=</span> <span class="n">hive_table</span>
@@ -243,29 +285,42 @@
<span class="bp">self</span><span class="o">.</span><span class="n">check_headers</span> <span class="o">=</span> <span class="n">check_headers</span>
<span class="bp">self</span><span class="o">.</span><span class="n">wildcard_match</span> <span class="o">=</span> <span class="n">wildcard_match</span>
<span class="bp">self</span><span class="o">.</span><span class="n">hive_cli_conn_id</span> <span class="o">=</span> <span class="n">hive_cli_conn_id</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">s3_conn_id</span> <span class="o">=</span> <span class="n">s3_conn_id</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">aws_conn_id</span> <span class="o">=</span> <span class="n">aws_conn_id</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">input_compressed</span> <span class="o">=</span> <span class="n">input_compressed</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">tblproperties</span> <span class="o">=</span> <span class="n">tblproperties</span>
+
+ <span class="k">if</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">check_headers</span> <span class="ow">and</span>
+ <span class="ow">not</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">headers</span><span class="p">)):</span>
+ <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">"To check_headers provide "</span> <span class="o">+</span>
+ <span class="s2">"field_dict and headers"</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">execute</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">context</span><span class="p">):</span>
+ <span class="c1"># Downloading file from S3</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">s3</span> <span class="o">=</span> <span class="n">S3Hook</span><span class="p">(</span><span class="n">aws_conn_id</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">aws_conn_id</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">hive</span> <span class="o">=</span> <span class="n">HiveCliHook</span><span class="p">(</span><span class="n">hive_cli_conn_id</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">hive_cli_conn_id</span><span class="p">)</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">s3</span> <span class="o">=</span> <span class="n">S3Hook</span><span class="p">(</span><span class="n">s3_conn_id</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_conn_id</span><span class="p">)</span>
- <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Downloading S3 file"</span><span class="p">)</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Downloading S3 file"</span><span class="p">)</span>
+
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">wildcard_match</span><span class="p">:</span>
<span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">check_for_wildcard_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">):</span>
- <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">"No key matches {0}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">))</span>
+ <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">"No key matches </span><span class="si">{0}</span><span class="s2">"</span>
+ <span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">))</span>
<span class="n">s3_key_object</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">get_wildcard_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">check_for_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">):</span>
<span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span>
- <span class="s2">"The key {0} does not exists"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">))</span>
+ <span class="s2">"The key </span><span class="si">{0}</span><span class="s2"> does not exists"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">))</span>
<span class="n">s3_key_object</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">get_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">)</span>
- <span class="k">with</span> <span class="n">NamedTemporaryFile</span><span class="p">(</span><span class="s2">"w"</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
- <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Dumping S3 key {0} contents to local"</span>
- <span class="s2">" file {1}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">s3_key_object</span><span class="o">.</span><span class="n">key</span><span class="p">,</span> <span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">))</span>
- <span class="n">s3_key_object</span><span class="o">.</span><span class="n">get_contents_to_file</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
+ <span class="n">root</span><span class="p">,</span> <span class="n">file_ext</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">splitext</span><span class="p">(</span><span class="n">s3_key_object</span><span class="o">.</span><span class="n">key</span><span class="p">)</span>
+ <span class="k">with</span> <span class="n">TemporaryDirectory</span><span class="p">(</span><span class="n">prefix</span><span class="o">=</span><span class="s1">'tmps32hive_'</span><span class="p">)</span> <span class="k">as</span> <span class="n">tmp_dir</span><span class="p">,</span>\
+ <span class="n">NamedTemporaryFile</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s2">"wb"</span><span class="p">,</span>
+ <span class="nb">dir</span><span class="o">=</span><span class="n">tmp_dir</span><span class="p">,</span>
+ <span class="n">suffix</span><span class="o">=</span><span class="n">file_ext</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Dumping S3 key </span><span class="si">{0}</span><span class="s2"> contents to local file </span><span class="si">{1}</span><span class="s2">"</span>
+ <span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">s3_key_object</span><span class="o">.</span><span class="n">key</span><span class="p">,</span> <span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">))</span>
+ <span class="n">s3_key_object</span><span class="o">.</span><span class="n">download_fileobj</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="n">f</span><span class="o">.</span><span class="n">flush</span><span class="p">()</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">connection</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
<span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">headers</span><span class="p">:</span>
- <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Loading file into Hive"</span><span class="p">)</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Loading file </span><span class="si">%s</span><span class="s2"> into Hive"</span><span class="p">,</span> <span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">hive</span><span class="o">.</span><span class="n">load_file</span><span class="p">(</span>
<span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">hive_table</span><span class="p">,</span>
@@ -273,41 +328,100 @@
<span class="n">create</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">create</span><span class="p">,</span>
<span class="n">partition</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">partition</span><span class="p">,</span>
<span class="n">delimiter</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span><span class="p">,</span>
- <span class="n">recreate</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">recreate</span><span class="p">)</span>
+ <span class="n">recreate</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">recreate</span><span class="p">,</span>
+ <span class="n">tblproperties</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">tblproperties</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
- <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="s1">'r'</span><span class="p">)</span> <span class="k">as</span> <span class="n">tmpf</span><span class="p">:</span>
- <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">check_headers</span><span class="p">:</span>
- <span class="n">header_l</span> <span class="o">=</span> <span class="n">tmpf</span><span class="o">.</span><span class="n">readline</span><span class="p">()</span>
- <span class="n">header_line</span> <span class="o">=</span> <span class="n">header_l</span><span class="o">.</span><span class="n">rstrip</span><span class="p">()</span>
- <span class="n">header_list</span> <span class="o">=</span> <span class="n">header_line</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span><span class="p">)</span>
- <span class="n">field_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
- <span class="n">test_field_match</span> <span class="o">=</span> <span class="p">[</span><span class="n">h1</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="o">==</span> <span class="n">h2</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="k">for</span> <span class="n">h1</span><span class="p">,</span> <span class="n">h2</span>
- <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">header_list</span><span class="p">,</span> <span class="n">field_names</span><span class="p">)]</span>
- <span class="k">if</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span><span class="n">test_field_match</span><span class="p">):</span>
- <span class="n">logging</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="s2">"Headers do not match field names"</span>
- <span class="s2">"File headers:</span><span class="se">\n</span><span class="s2"> {header_list}</span><span class="se">\n</span><span class="s2">"</span>
- <span class="s2">"Field names: </span><span class="se">\n</span><span class="s2"> {field_names}</span><span class="se">\n</span><span class="s2">"</span>
- <span class="s2">""</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="o">**</span><span class="nb">locals</span><span class="p">()))</span>
- <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">"Headers do not match the "</span>
- <span class="s2">"field_dict keys"</span><span class="p">)</span>
- <span class="k">with</span> <span class="n">NamedTemporaryFile</span><span class="p">(</span><span class="s2">"w"</span><span class="p">)</span> <span class="k">as</span> <span class="n">f_no_headers</span><span class="p">:</span>
- <span class="n">tmpf</span><span class="o">.</span><span class="n">seek</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
- <span class="nb">next</span><span class="p">(</span><span class="n">tmpf</span><span class="p">)</span>
- <span class="k">for</span> <span class="n">line</span> <span class="ow">in</span> <span class="n">tmpf</span><span class="p">:</span>
- <span class="n">f_no_headers</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">line</span><span class="p">)</span>
- <span class="n">f_no_headers</span><span class="o">.</span><span class="n">flush</span><span class="p">()</span>
- <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Loading file without headers into Hive"</span><span class="p">)</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">hive</span><span class="o">.</span><span class="n">load_file</span><span class="p">(</span>
- <span class="n">f_no_headers</span><span class="o">.</span><span class="n">name</span><span class="p">,</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">hive_table</span><span class="p">,</span>
- <span class="n">field_dict</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span><span class="p">,</span>
- <span class="n">create</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">create</span><span class="p">,</span>
- <span class="n">partition</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">partition</span><span class="p">,</span>
- <span class="n">delimiter</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span><span class="p">,</span>
- <span class="n">recreate</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">recreate</span><span class="p">)</span></div>
+ <span class="c1"># Decompressing file</span>
+ <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">input_compressed</span><span class="p">:</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Uncompressing file </span><span class="si">%s</span><span class="s2">"</span><span class="p">,</span> <span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
+ <span class="n">fn_uncompressed</span> <span class="o">=</span> <span class="n">uncompress_file</span><span class="p">(</span><span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">,</span>
+ <span class="n">file_ext</span><span class="p">,</span>
+ <span class="n">tmp_dir</span><span class="p">)</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Uncompressed to </span><span class="si">%s</span><span class="s2">"</span><span class="p">,</span> <span class="n">fn_uncompressed</span><span class="p">)</span>
+ <span class="c1"># uncompressed file available now so deleting</span>
+ <span class="c1"># compressed file to save disk space</span>
+ <span class="n">f</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
+ <span class="k">else</span><span class="p">:</span>
+ <span class="n">fn_uncompressed</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">name</span>
+
+ <span class="c1"># Testing if header matches field_dict</span>
+ <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">check_headers</span><span class="p">:</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Matching file header against field_dict"</span><span class="p">)</span>
+ <span class="n">header_list</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_top_row_as_list</span><span class="p">(</span><span class="n">fn_uncompressed</span><span class="p">)</span>
+ <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_match_headers</span><span class="p">(</span><span class="n">header_list</span><span class="p">):</span>
+ <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">"Header check failed"</span><span class="p">)</span>
+
+ <span class="c1"># Deleting top header row</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Removing header from file </span><span class="si">%s</span><span class="s2">"</span><span class="p">,</span> <span class="n">fn_uncompressed</span><span class="p">)</span>
+ <span class="n">headless_file</span> <span class="o">=</span> <span class="p">(</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">_delete_top_row_and_compress</span><span class="p">(</span><span class="n">fn_uncompressed</span><span class="p">,</span>
+ <span class="n">file_ext</span><span class="p">,</span>
+ <span class="n">tmp_dir</span><span class="p">))</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Headless file </span><span class="si">%s</span><span class="s2">"</span><span class="p">,</span> <span class="n">headless_file</span><span class="p">)</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Loading file </span><span class="si">%s</span><span class="s2"> into Hive"</span><span class="p">,</span> <span class="n">headless_file</span><span class="p">)</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">hive</span><span class="o">.</span><span class="n">load_file</span><span class="p">(</span><span class="n">headless_file</span><span class="p">,</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">hive_table</span><span class="p">,</span>
+ <span class="n">field_dict</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span><span class="p">,</span>
+ <span class="n">create</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">create</span><span class="p">,</span>
+ <span class="n">partition</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">partition</span><span class="p">,</span>
+ <span class="n">delimiter</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span><span class="p">,</span>
+ <span class="n">recreate</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">recreate</span><span class="p">,</span>
+ <span class="n">tblproperties</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">tblproperties</span><span class="p">)</span>
+
+ <span class="k">def</span> <span class="nf">_get_top_row_as_list</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">file_name</span><span class="p">):</span>
+ <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">file_name</span><span class="p">,</span> <span class="s1">'rt'</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+ <span class="n">header_line</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">readline</span><span class="p">()</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span>
+ <span class="n">header_list</span> <span class="o">=</span> <span class="n">header_line</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span><span class="p">)</span>
+ <span class="k">return</span> <span class="n">header_list</span>
+
+ <span class="k">def</span> <span class="nf">_match_headers</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">header_list</span><span class="p">):</span>
+ <span class="k">if</span> <span class="ow">not</span> <span class="n">header_list</span><span class="p">:</span>
+ <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">"Unable to retrieve header row from file"</span><span class="p">)</span>
+ <span class="n">field_names</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span><span class="o">.</span><span class="n">keys</span><span class="p">()</span>
+ <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">field_names</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">header_list</span><span class="p">):</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="s2">"Headers count mismatch"</span>
+ <span class="s2">"File headers:</span><span class="se">\n</span><span class="s2"> </span><span class="si">{header_list}</span><span class="se">\n</span><span class="s2">"</span>
+ <span class="s2">"Field names: </span><span class="se">\n</span><span class="s2"> </span><span class="si">{field_names}</span><span class="se">\n</span><span class="s2">"</span>
+ <span class="s2">""</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="o">**</span><span class="nb">locals</span><span class="p">()))</span>
+ <span class="k">return</span> <span class="kc">False</span>
+ <span class="n">test_field_match</span> <span class="o">=</span> <span class="p">[</span><span class="n">h1</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="o">==</span> <span class="n">h2</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span>
+ <span class="k">for</span> <span class="n">h1</span><span class="p">,</span> <span class="n">h2</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">header_list</span><span class="p">,</span> <span class="n">field_names</span><span class="p">)]</span>
+ <span class="k">if</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span><span class="n">test_field_match</span><span class="p">):</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="s2">"Headers do not match field names"</span>
+ <span class="s2">"File headers:</span><span class="se">\n</span><span class="s2"> </span><span class="si">{header_list}</span><span class="se">\n</span><span class="s2">"</span>
+ <span class="s2">"Field names: </span><span class="se">\n</span><span class="s2"> </span><span class="si">{field_names}</span><span class="se">\n</span><span class="s2">"</span>
+ <span class="s2">""</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="o">**</span><span class="nb">locals</span><span class="p">()))</span>
+ <span class="k">return</span> <span class="kc">False</span>
+ <span class="k">else</span><span class="p">:</span>
+ <span class="k">return</span> <span class="kc">True</span>
+
+ <span class="k">def</span> <span class="nf">_delete_top_row_and_compress</span><span class="p">(</span>
+ <span class="bp">self</span><span class="p">,</span>
+ <span class="n">input_file_name</span><span class="p">,</span>
+ <span class="n">output_file_ext</span><span class="p">,</span>
+ <span class="n">dest_dir</span><span class="p">):</span>
+ <span class="c1"># When output_file_ext is not defined, file is not compressed</span>
+ <span class="n">open_fn</span> <span class="o">=</span> <span class="nb">open</span>
+ <span class="k">if</span> <span class="n">output_file_ext</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="o">==</span> <span class="s1">'.gz'</span><span class="p">:</span>
+ <span class="n">open_fn</span> <span class="o">=</span> <span class="n">gzip</span><span class="o">.</span><span class="n">GzipFile</span>
+ <span class="k">elif</span> <span class="n">output_file_ext</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="o">==</span> <span class="s1">'.bz2'</span><span class="p">:</span>
+ <span class="n">open_fn</span> <span class="o">=</span> <span class="n">bz2</span><span class="o">.</span><span class="n">BZ2File</span>
+
+ <span class="n">os_fh_output</span><span class="p">,</span> <span class="n">fn_output</span> <span class="o">=</span> \
+ <span class="n">tempfile</span><span class="o">.</span><span class="n">mkstemp</span><span class="p">(</span><span class="n">suffix</span><span class="o">=</span><span class="n">output_file_ext</span><span class="p">,</span> <span class="nb">dir</span><span class="o">=</span><span class="n">dest_dir</span><span class="p">)</span>
+ <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">input_file_name</span><span class="p">,</span> <span class="s1">'rb'</span><span class="p">)</span> <span class="k">as</span> <span class="n">f_in</span><span class="p">,</span>\
+ <span class="n">open_fn</span><span class="p">(</span><span class="n">fn_output</span><span class="p">,</span> <span class="s1">'wb'</span><span class="p">)</span> <span class="k">as</span> <span class="n">f_out</span><span class="p">:</span>
+ <span class="n">f_in</span><span class="o">.</span><span class="n">seek</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
+ <span class="nb">next</span><span class="p">(</span><span class="n">f_in</span><span class="p">)</span>
+ <span class="k">for</span> <span class="n">line</span> <span class="ow">in</span> <span class="n">f_in</span><span class="p">:</span>
+ <span class="n">f_out</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">line</span><span class="p">)</span>
+ <span class="k">return</span> <span class="n">fn_output</span></div>
</pre></div>
</div>
+ <div class="articleComments">
+
+ </div>
</div>
<footer>
@@ -340,7 +454,8 @@
VERSION:'',
COLLAPSE_INDEX:false,
FILE_SUFFIX:'.html',
- HAS_SOURCE: true
+ HAS_SOURCE: true,
+ SOURCELINK_SUFFIX: '.txt'
};
</script>
<script type="text/javascript" src="../_static/jquery.js"></script>