You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by cr...@apache.org on 2018/01/03 17:48:11 UTC

[11/35] incubator-airflow-site git commit: 1.9.0

http://git-wip-us.apache.org/repos/asf/incubator-airflow-site/blob/28a3eb60/_modules/s3_to_hive_operator.html
----------------------------------------------------------------------
diff --git a/_modules/s3_to_hive_operator.html b/_modules/s3_to_hive_operator.html
index ad7715e..840a81b 100644
--- a/_modules/s3_to_hive_operator.html
+++ b/_modules/s3_to_hive_operator.html
@@ -13,6 +13,8 @@
 
   
   
+  
+  
 
   
 
@@ -30,6 +32,9 @@
   
 
   
+        <link rel="index" title="Index"
+              href="../genindex.html"/>
+        <link rel="search" title="Search" href="../search.html"/>
     <link rel="top" title="Airflow Documentation" href="../index.html"/>
         <link rel="up" title="Module code" href="index.html"/> 
 
@@ -40,6 +45,7 @@
 
 <body class="wy-body-for-nav" role="document">
 
+   
   <div class="wy-grid-for-nav">
 
     
@@ -76,7 +82,10 @@
           
             
             
-                <ul>
+              
+            
+            
+              <ul>
 <li class="toctree-l1"><a class="reference internal" href="../project.html">Project</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../license.html">License</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../start.html">Quick Start</a></li>
@@ -90,6 +99,8 @@
 <li class="toctree-l1"><a class="reference internal" href="../scheduler.html">Scheduling &amp; Triggers</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../plugins.html">Plugins</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../security.html">Security</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../api.html">Experimental Rest API</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../integration.html">Integration</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../faq.html">FAQ</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../code.html">API Reference</a></li>
 </ul>
@@ -104,8 +115,10 @@
 
       
       <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
-        <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
-        <a href="../index.html">Airflow</a>
+        
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../index.html">Airflow</a>
+        
       </nav>
 
 
@@ -118,19 +131,36 @@
 
 
 
+
+
+
+
+
+
+
+
+
+
 <div role="navigation" aria-label="breadcrumbs navigation">
+
   <ul class="wy-breadcrumbs">
-    <li><a href="../index.html">Docs</a> &raquo;</li>
-      
+    
+      <li><a href="../index.html">Docs</a> &raquo;</li>
+        
           <li><a href="index.html">Module code</a> &raquo;</li>
-      
-    <li>s3_to_hive_operator</li>
+        
+      <li>s3_to_hive_operator</li>
+    
+    
       <li class="wy-breadcrumbs-aside">
         
-          
+            
         
       </li>
+    
   </ul>
+
+  
   <hr/>
 </div>
           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
@@ -151,16 +181,21 @@
 <span class="c1"># See the License for the specific language governing permissions and</span>
 <span class="c1"># limitations under the License.</span>
 
-<span class="kn">from</span> <span class="nn">builtins</span> <span class="kn">import</span> <span class="nb">next</span>
-<span class="kn">from</span> <span class="nn">builtins</span> <span class="kn">import</span> <span class="nb">zip</span>
-<span class="kn">import</span> <span class="nn">logging</span>
-<span class="kn">from</span> <span class="nn">tempfile</span> <span class="kn">import</span> <span class="n">NamedTemporaryFile</span>
+<span class="kn">from</span> <span class="nn">builtins</span> <span class="k">import</span> <span class="nb">next</span>
+<span class="kn">from</span> <span class="nn">builtins</span> <span class="k">import</span> <span class="nb">zip</span>
+<span class="kn">from</span> <span class="nn">tempfile</span> <span class="k">import</span> <span class="n">NamedTemporaryFile</span>
+<span class="kn">from</span> <span class="nn">airflow.utils.file</span> <span class="k">import</span> <span class="n">TemporaryDirectory</span>
+<span class="kn">import</span> <span class="nn">gzip</span>
+<span class="kn">import</span> <span class="nn">bz2</span>
+<span class="kn">import</span> <span class="nn">tempfile</span>
+<span class="kn">import</span> <span class="nn">os</span>
 
-<span class="kn">from</span> <span class="nn">airflow.exceptions</span> <span class="kn">import</span> <span class="n">AirflowException</span>
-<span class="kn">from</span> <span class="nn">airflow.hooks.S3_hook</span> <span class="kn">import</span> <span class="n">S3Hook</span>
-<span class="kn">from</span> <span class="nn">airflow.hooks.hive_hooks</span> <span class="kn">import</span> <span class="n">HiveCliHook</span>
-<span class="kn">from</span> <span class="nn">airflow.models</span> <span class="kn">import</span> <span class="n">BaseOperator</span>
-<span class="kn">from</span> <span class="nn">airflow.utils.decorators</span> <span class="kn">import</span> <span class="n">apply_defaults</span>
+<span class="kn">from</span> <span class="nn">airflow.exceptions</span> <span class="k">import</span> <span class="n">AirflowException</span>
+<span class="kn">from</span> <span class="nn">airflow.hooks.S3_hook</span> <span class="k">import</span> <span class="n">S3Hook</span>
+<span class="kn">from</span> <span class="nn">airflow.hooks.hive_hooks</span> <span class="k">import</span> <span class="n">HiveCliHook</span>
+<span class="kn">from</span> <span class="nn">airflow.models</span> <span class="k">import</span> <span class="n">BaseOperator</span>
+<span class="kn">from</span> <span class="nn">airflow.utils.decorators</span> <span class="k">import</span> <span class="n">apply_defaults</span>
+<span class="kn">from</span> <span class="nn">airflow.utils.compression</span> <span class="k">import</span> <span class="n">uncompress_file</span>
 
 
 <div class="viewcode-block" id="S3ToHiveTransfer"><a class="viewcode-back" href="../code.html#airflow.operators.S3ToHiveTransfer">[docs]</a><span class="k">class</span> <span class="nc">S3ToHiveTransfer</span><span class="p">(</span><span class="n">BaseOperator</span><span class="p">):</span>
@@ -205,10 +240,15 @@
 <span class="sd">    :type wildcard_match: bool</span>
 <span class="sd">    :param delimiter: field delimiter in the file</span>
 <span class="sd">    :type delimiter: str</span>
-<span class="sd">    :param s3_conn_id: source s3 connection</span>
-<span class="sd">    :type s3_conn_id: str</span>
-<span class="sd">    :param hive_conn_id: destination hive connection</span>
-<span class="sd">    :type hive_conn_id: str</span>
+<span class="sd">    :param aws_conn_id: source s3 connection</span>
+<span class="sd">    :type aws_conn_id: str</span>
+<span class="sd">    :param hive_cli_conn_id: destination hive connection</span>
+<span class="sd">    :type hive_cli_conn_id: str</span>
+<span class="sd">    :param input_compressed: Boolean to determine if file decompression is</span>
+<span class="sd">        required to process headers</span>
+<span class="sd">    :type input_compressed: bool</span>
+<span class="sd">    :param tblproperties: TBLPROPERTIES of the hive table being created</span>
+<span class="sd">    :type tblproperties: dict</span>
 <span class="sd">    &quot;&quot;&quot;</span>
 
     <span class="n">template_fields</span> <span class="o">=</span> <span class="p">(</span><span class="s1">&#39;s3_key&#39;</span><span class="p">,</span> <span class="s1">&#39;partition&#39;</span><span class="p">,</span> <span class="s1">&#39;hive_table&#39;</span><span class="p">)</span>
@@ -222,16 +262,18 @@
             <span class="n">field_dict</span><span class="p">,</span>
             <span class="n">hive_table</span><span class="p">,</span>
             <span class="n">delimiter</span><span class="o">=</span><span class="s1">&#39;,&#39;</span><span class="p">,</span>
-            <span class="n">create</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span>
-            <span class="n">recreate</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span>
-            <span class="n">partition</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span>
-            <span class="n">headers</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span>
-            <span class="n">check_headers</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span>
-            <span class="n">wildcard_match</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span>
-            <span class="n">s3_conn_id</span><span class="o">=</span><span class="s1">&#39;s3_default&#39;</span><span class="p">,</span>
+            <span class="n">create</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
+            <span class="n">recreate</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+            <span class="n">partition</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
+            <span class="n">headers</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+            <span class="n">check_headers</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+            <span class="n">wildcard_match</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+            <span class="n">aws_conn_id</span><span class="o">=</span><span class="s1">&#39;aws_default&#39;</span><span class="p">,</span>
             <span class="n">hive_cli_conn_id</span><span class="o">=</span><span class="s1">&#39;hive_cli_default&#39;</span><span class="p">,</span>
+            <span class="n">input_compressed</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+            <span class="n">tblproperties</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
             <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
-        <span class="nb">super</span><span class="p">(</span><span class="n">S3ToHiveTransfer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
+        <span class="nb">super</span><span class="p">(</span><span class="n">S3ToHiveTransfer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
         <span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span> <span class="o">=</span> <span class="n">s3_key</span>
         <span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span> <span class="o">=</span> <span class="n">field_dict</span>
         <span class="bp">self</span><span class="o">.</span><span class="n">hive_table</span> <span class="o">=</span> <span class="n">hive_table</span>
@@ -243,29 +285,42 @@
         <span class="bp">self</span><span class="o">.</span><span class="n">check_headers</span> <span class="o">=</span> <span class="n">check_headers</span>
         <span class="bp">self</span><span class="o">.</span><span class="n">wildcard_match</span> <span class="o">=</span> <span class="n">wildcard_match</span>
         <span class="bp">self</span><span class="o">.</span><span class="n">hive_cli_conn_id</span> <span class="o">=</span> <span class="n">hive_cli_conn_id</span>
-        <span class="bp">self</span><span class="o">.</span><span class="n">s3_conn_id</span> <span class="o">=</span> <span class="n">s3_conn_id</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">aws_conn_id</span> <span class="o">=</span> <span class="n">aws_conn_id</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">input_compressed</span> <span class="o">=</span> <span class="n">input_compressed</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">tblproperties</span> <span class="o">=</span> <span class="n">tblproperties</span>
+
+        <span class="k">if</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">check_headers</span> <span class="ow">and</span>
+                <span class="ow">not</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">headers</span><span class="p">)):</span>
+            <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">&quot;To check_headers provide &quot;</span> <span class="o">+</span>
+                                   <span class="s2">&quot;field_dict and headers&quot;</span><span class="p">)</span>
 
     <span class="k">def</span> <span class="nf">execute</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">context</span><span class="p">):</span>
+        <span class="c1"># Downloading file from S3</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">s3</span> <span class="o">=</span> <span class="n">S3Hook</span><span class="p">(</span><span class="n">aws_conn_id</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">aws_conn_id</span><span class="p">)</span>
         <span class="bp">self</span><span class="o">.</span><span class="n">hive</span> <span class="o">=</span> <span class="n">HiveCliHook</span><span class="p">(</span><span class="n">hive_cli_conn_id</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">hive_cli_conn_id</span><span class="p">)</span>
-        <span class="bp">self</span><span class="o">.</span><span class="n">s3</span> <span class="o">=</span> <span class="n">S3Hook</span><span class="p">(</span><span class="n">s3_conn_id</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_conn_id</span><span class="p">)</span>
-        <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Downloading S3 file&quot;</span><span class="p">)</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Downloading S3 file&quot;</span><span class="p">)</span>
+
         <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">wildcard_match</span><span class="p">:</span>
             <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">check_for_wildcard_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">):</span>
-                <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">&quot;No key matches {0}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">))</span>
+                <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">&quot;No key matches </span><span class="si">{0}</span><span class="s2">&quot;</span>
+                                       <span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">))</span>
             <span class="n">s3_key_object</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">get_wildcard_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">)</span>
         <span class="k">else</span><span class="p">:</span>
             <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">check_for_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">):</span>
                 <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span>
-                    <span class="s2">&quot;The key {0} does not exists&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">))</span>
+                    <span class="s2">&quot;The key </span><span class="si">{0}</span><span class="s2"> does not exists&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">))</span>
             <span class="n">s3_key_object</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">get_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">)</span>
-        <span class="k">with</span> <span class="n">NamedTemporaryFile</span><span class="p">(</span><span class="s2">&quot;w&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
-            <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Dumping S3 key {0} contents to local&quot;</span>
-                         <span class="s2">&quot; file {1}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">s3_key_object</span><span class="o">.</span><span class="n">key</span><span class="p">,</span> <span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">))</span>
-            <span class="n">s3_key_object</span><span class="o">.</span><span class="n">get_contents_to_file</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
+        <span class="n">root</span><span class="p">,</span> <span class="n">file_ext</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">splitext</span><span class="p">(</span><span class="n">s3_key_object</span><span class="o">.</span><span class="n">key</span><span class="p">)</span>
+        <span class="k">with</span> <span class="n">TemporaryDirectory</span><span class="p">(</span><span class="n">prefix</span><span class="o">=</span><span class="s1">&#39;tmps32hive_&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">tmp_dir</span><span class="p">,</span>\
+                <span class="n">NamedTemporaryFile</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s2">&quot;wb&quot;</span><span class="p">,</span>
+                                   <span class="nb">dir</span><span class="o">=</span><span class="n">tmp_dir</span><span class="p">,</span>
+                                   <span class="n">suffix</span><span class="o">=</span><span class="n">file_ext</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+            <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Dumping S3 key </span><span class="si">{0}</span><span class="s2"> contents to local file </span><span class="si">{1}</span><span class="s2">&quot;</span>
+                          <span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">s3_key_object</span><span class="o">.</span><span class="n">key</span><span class="p">,</span> <span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">))</span>
+            <span class="n">s3_key_object</span><span class="o">.</span><span class="n">download_fileobj</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
             <span class="n">f</span><span class="o">.</span><span class="n">flush</span><span class="p">()</span>
-            <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">connection</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
             <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">headers</span><span class="p">:</span>
-                <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Loading file into Hive&quot;</span><span class="p">)</span>
+                <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Loading file </span><span class="si">%s</span><span class="s2"> into Hive&quot;</span><span class="p">,</span> <span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
                 <span class="bp">self</span><span class="o">.</span><span class="n">hive</span><span class="o">.</span><span class="n">load_file</span><span class="p">(</span>
                     <span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">,</span>
                     <span class="bp">self</span><span class="o">.</span><span class="n">hive_table</span><span class="p">,</span>
@@ -273,41 +328,100 @@
                     <span class="n">create</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">create</span><span class="p">,</span>
                     <span class="n">partition</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">partition</span><span class="p">,</span>
                     <span class="n">delimiter</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span><span class="p">,</span>
-                    <span class="n">recreate</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">recreate</span><span class="p">)</span>
+                    <span class="n">recreate</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">recreate</span><span class="p">,</span>
+                    <span class="n">tblproperties</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">tblproperties</span><span class="p">)</span>
             <span class="k">else</span><span class="p">:</span>
-                <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="s1">&#39;r&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">tmpf</span><span class="p">:</span>
-                    <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">check_headers</span><span class="p">:</span>
-                        <span class="n">header_l</span> <span class="o">=</span> <span class="n">tmpf</span><span class="o">.</span><span class="n">readline</span><span class="p">()</span>
-                        <span class="n">header_line</span> <span class="o">=</span> <span class="n">header_l</span><span class="o">.</span><span class="n">rstrip</span><span class="p">()</span>
-                        <span class="n">header_list</span> <span class="o">=</span> <span class="n">header_line</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span><span class="p">)</span>
-                        <span class="n">field_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
-                        <span class="n">test_field_match</span> <span class="o">=</span> <span class="p">[</span><span class="n">h1</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="o">==</span> <span class="n">h2</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="k">for</span> <span class="n">h1</span><span class="p">,</span> <span class="n">h2</span>
-                                            <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">header_list</span><span class="p">,</span> <span class="n">field_names</span><span class="p">)]</span>
-                        <span class="k">if</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span><span class="n">test_field_match</span><span class="p">):</span>
-                            <span class="n">logging</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="s2">&quot;Headers do not match field names&quot;</span>
-                                            <span class="s2">&quot;File headers:</span><span class="se">\n</span><span class="s2"> {header_list}</span><span class="se">\n</span><span class="s2">&quot;</span>
-                                            <span class="s2">&quot;Field names: </span><span class="se">\n</span><span class="s2"> {field_names}</span><span class="se">\n</span><span class="s2">&quot;</span>
-                                            <span class="s2">&quot;&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="o">**</span><span class="nb">locals</span><span class="p">()))</span>
-                            <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">&quot;Headers do not match the &quot;</span>
-                                            <span class="s2">&quot;field_dict keys&quot;</span><span class="p">)</span>
-                    <span class="k">with</span> <span class="n">NamedTemporaryFile</span><span class="p">(</span><span class="s2">&quot;w&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f_no_headers</span><span class="p">:</span>
-                        <span class="n">tmpf</span><span class="o">.</span><span class="n">seek</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
-                        <span class="nb">next</span><span class="p">(</span><span class="n">tmpf</span><span class="p">)</span>
-                        <span class="k">for</span> <span class="n">line</span> <span class="ow">in</span> <span class="n">tmpf</span><span class="p">:</span>
-                            <span class="n">f_no_headers</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">line</span><span class="p">)</span>
-                        <span class="n">f_no_headers</span><span class="o">.</span><span class="n">flush</span><span class="p">()</span>
-                        <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Loading file without headers into Hive&quot;</span><span class="p">)</span>
-                        <span class="bp">self</span><span class="o">.</span><span class="n">hive</span><span class="o">.</span><span class="n">load_file</span><span class="p">(</span>
-                            <span class="n">f_no_headers</span><span class="o">.</span><span class="n">name</span><span class="p">,</span>
-                            <span class="bp">self</span><span class="o">.</span><span class="n">hive_table</span><span class="p">,</span>
-                            <span class="n">field_dict</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span><span class="p">,</span>
-                            <span class="n">create</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">create</span><span class="p">,</span>
-                            <span class="n">partition</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">partition</span><span class="p">,</span>
-                            <span class="n">delimiter</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span><span class="p">,</span>
-                            <span class="n">recreate</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">recreate</span><span class="p">)</span></div>
+                <span class="c1"># Decompressing file</span>
+                <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">input_compressed</span><span class="p">:</span>
+                    <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Uncompressing file </span><span class="si">%s</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
+                    <span class="n">fn_uncompressed</span> <span class="o">=</span> <span class="n">uncompress_file</span><span class="p">(</span><span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">,</span>
+                                                      <span class="n">file_ext</span><span class="p">,</span>
+                                                      <span class="n">tmp_dir</span><span class="p">)</span>
+                    <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Uncompressed to </span><span class="si">%s</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">fn_uncompressed</span><span class="p">)</span>
+                    <span class="c1"># uncompressed file available now so deleting</span>
+                    <span class="c1"># compressed file to save disk space</span>
+                    <span class="n">f</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
+                <span class="k">else</span><span class="p">:</span>
+                    <span class="n">fn_uncompressed</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">name</span>
+
+                <span class="c1"># Testing if header matches field_dict</span>
+                <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">check_headers</span><span class="p">:</span>
+                    <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Matching file header against field_dict&quot;</span><span class="p">)</span>
+                    <span class="n">header_list</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_top_row_as_list</span><span class="p">(</span><span class="n">fn_uncompressed</span><span class="p">)</span>
+                    <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_match_headers</span><span class="p">(</span><span class="n">header_list</span><span class="p">):</span>
+                        <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">&quot;Header check failed&quot;</span><span class="p">)</span>
+
+                <span class="c1"># Deleting top header row</span>
+                <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Removing header from file </span><span class="si">%s</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">fn_uncompressed</span><span class="p">)</span>
+                <span class="n">headless_file</span> <span class="o">=</span> <span class="p">(</span>
+                    <span class="bp">self</span><span class="o">.</span><span class="n">_delete_top_row_and_compress</span><span class="p">(</span><span class="n">fn_uncompressed</span><span class="p">,</span>
+                                                      <span class="n">file_ext</span><span class="p">,</span>
+                                                      <span class="n">tmp_dir</span><span class="p">))</span>
+                <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Headless file </span><span class="si">%s</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">headless_file</span><span class="p">)</span>
+                <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Loading file </span><span class="si">%s</span><span class="s2"> into Hive&quot;</span><span class="p">,</span> <span class="n">headless_file</span><span class="p">)</span>
+                <span class="bp">self</span><span class="o">.</span><span class="n">hive</span><span class="o">.</span><span class="n">load_file</span><span class="p">(</span><span class="n">headless_file</span><span class="p">,</span>
+                                    <span class="bp">self</span><span class="o">.</span><span class="n">hive_table</span><span class="p">,</span>
+                                    <span class="n">field_dict</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span><span class="p">,</span>
+                                    <span class="n">create</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">create</span><span class="p">,</span>
+                                    <span class="n">partition</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">partition</span><span class="p">,</span>
+                                    <span class="n">delimiter</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span><span class="p">,</span>
+                                    <span class="n">recreate</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">recreate</span><span class="p">,</span>
+                                    <span class="n">tblproperties</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">tblproperties</span><span class="p">)</span>
+
+    <span class="k">def</span> <span class="nf">_get_top_row_as_list</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">file_name</span><span class="p">):</span>
+        <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">file_name</span><span class="p">,</span> <span class="s1">&#39;rt&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+            <span class="n">header_line</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">readline</span><span class="p">()</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span>
+            <span class="n">header_list</span> <span class="o">=</span> <span class="n">header_line</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span><span class="p">)</span>
+            <span class="k">return</span> <span class="n">header_list</span>
+
+    <span class="k">def</span> <span class="nf">_match_headers</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">header_list</span><span class="p">):</span>
+        <span class="k">if</span> <span class="ow">not</span> <span class="n">header_list</span><span class="p">:</span>
+            <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">&quot;Unable to retrieve header row from file&quot;</span><span class="p">)</span>
+        <span class="n">field_names</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span><span class="o">.</span><span class="n">keys</span><span class="p">()</span>
+        <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">field_names</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">header_list</span><span class="p">):</span>
+            <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="s2">&quot;Headers count mismatch&quot;</span>
+                              <span class="s2">&quot;File headers:</span><span class="se">\n</span><span class="s2"> </span><span class="si">{header_list}</span><span class="se">\n</span><span class="s2">&quot;</span>
+                              <span class="s2">&quot;Field names: </span><span class="se">\n</span><span class="s2"> </span><span class="si">{field_names}</span><span class="se">\n</span><span class="s2">&quot;</span>
+                              <span class="s2">&quot;&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="o">**</span><span class="nb">locals</span><span class="p">()))</span>
+            <span class="k">return</span> <span class="kc">False</span>
+        <span class="n">test_field_match</span> <span class="o">=</span> <span class="p">[</span><span class="n">h1</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="o">==</span> <span class="n">h2</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span>
+                            <span class="k">for</span> <span class="n">h1</span><span class="p">,</span> <span class="n">h2</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">header_list</span><span class="p">,</span> <span class="n">field_names</span><span class="p">)]</span>
+        <span class="k">if</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span><span class="n">test_field_match</span><span class="p">):</span>
+            <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="s2">&quot;Headers do not match field names&quot;</span>
+                              <span class="s2">&quot;File headers:</span><span class="se">\n</span><span class="s2"> </span><span class="si">{header_list}</span><span class="se">\n</span><span class="s2">&quot;</span>
+                              <span class="s2">&quot;Field names: </span><span class="se">\n</span><span class="s2"> </span><span class="si">{field_names}</span><span class="se">\n</span><span class="s2">&quot;</span>
+                              <span class="s2">&quot;&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="o">**</span><span class="nb">locals</span><span class="p">()))</span>
+            <span class="k">return</span> <span class="kc">False</span>
+        <span class="k">else</span><span class="p">:</span>
+            <span class="k">return</span> <span class="kc">True</span>
+
+    <span class="k">def</span> <span class="nf">_delete_top_row_and_compress</span><span class="p">(</span>
+            <span class="bp">self</span><span class="p">,</span>
+            <span class="n">input_file_name</span><span class="p">,</span>
+            <span class="n">output_file_ext</span><span class="p">,</span>
+            <span class="n">dest_dir</span><span class="p">):</span>
+        <span class="c1"># When output_file_ext is not defined, file is not compressed</span>
+        <span class="n">open_fn</span> <span class="o">=</span> <span class="nb">open</span>
+        <span class="k">if</span> <span class="n">output_file_ext</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="o">==</span> <span class="s1">&#39;.gz&#39;</span><span class="p">:</span>
+            <span class="n">open_fn</span> <span class="o">=</span> <span class="n">gzip</span><span class="o">.</span><span class="n">GzipFile</span>
+        <span class="k">elif</span> <span class="n">output_file_ext</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="o">==</span> <span class="s1">&#39;.bz2&#39;</span><span class="p">:</span>
+            <span class="n">open_fn</span> <span class="o">=</span> <span class="n">bz2</span><span class="o">.</span><span class="n">BZ2File</span>
+
+        <span class="n">os_fh_output</span><span class="p">,</span> <span class="n">fn_output</span> <span class="o">=</span> \
+            <span class="n">tempfile</span><span class="o">.</span><span class="n">mkstemp</span><span class="p">(</span><span class="n">suffix</span><span class="o">=</span><span class="n">output_file_ext</span><span class="p">,</span> <span class="nb">dir</span><span class="o">=</span><span class="n">dest_dir</span><span class="p">)</span>
+        <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">input_file_name</span><span class="p">,</span> <span class="s1">&#39;rb&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f_in</span><span class="p">,</span>\
+                <span class="n">open_fn</span><span class="p">(</span><span class="n">fn_output</span><span class="p">,</span> <span class="s1">&#39;wb&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f_out</span><span class="p">:</span>
+            <span class="n">f_in</span><span class="o">.</span><span class="n">seek</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
+            <span class="nb">next</span><span class="p">(</span><span class="n">f_in</span><span class="p">)</span>
+            <span class="k">for</span> <span class="n">line</span> <span class="ow">in</span> <span class="n">f_in</span><span class="p">:</span>
+                <span class="n">f_out</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">line</span><span class="p">)</span>
+        <span class="k">return</span> <span class="n">fn_output</span></div>
 </pre></div>
 
            </div>
+           <div class="articleComments">
+            
+           </div>
           </div>
           <footer>
   
@@ -340,7 +454,8 @@
             VERSION:'',
             COLLAPSE_INDEX:false,
             FILE_SUFFIX:'.html',
-            HAS_SOURCE:  true
+            HAS_SOURCE:  true,
+            SOURCELINK_SUFFIX: '.txt'
         };
     </script>
       <script type="text/javascript" src="../_static/jquery.js"></script>