You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by ka...@apache.org on 2018/08/27 16:26:27 UTC
[07/51] [partial] incubator-airflow-site git commit: 1.10.0
http://git-wip-us.apache.org/repos/asf/incubator-airflow-site/blob/11437c14/_modules/airflow/operators/s3_file_transform_operator.html
----------------------------------------------------------------------
diff --git a/_modules/airflow/operators/s3_file_transform_operator.html b/_modules/airflow/operators/s3_file_transform_operator.html
index 8db7bc2..366400e 100644
--- a/_modules/airflow/operators/s3_file_transform_operator.html
+++ b/_modules/airflow/operators/s3_file_transform_operator.html
@@ -91,7 +91,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../start.html">Quick Start</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../tutorial.html">Tutorial</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../configuration.html">Configuration</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../howto/index.html">How-to Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../ui.html">UI / Screenshots</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../concepts.html">Concepts</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../profiling.html">Data Profiling</a></li>
@@ -99,8 +99,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../scheduler.html">Scheduling &amp; Triggers</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../plugins.html">Plugins</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../security.html">Security</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../timezone.html">Time zones</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../api.html">Experimental Rest API</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../integration.html">Integration</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../lineage.html">Lineage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../faq.html">FAQ</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../code.html">API Reference</a></li>
</ul>
@@ -169,17 +171,22 @@
<h1>Source code for airflow.operators.s3_file_transform_operator</h1><div class="highlight"><pre>
<span></span><span class="c1"># -*- coding: utf-8 -*-</span>
<span class="c1">#</span>
-<span class="c1"># Licensed under the Apache License, Version 2.0 (the "License");</span>
-<span class="c1"># you may not use this file except in compliance with the License.</span>
-<span class="c1"># You may obtain a copy of the License at</span>
-<span class="c1">#</span>
-<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span>
-<span class="c1">#</span>
-<span class="c1"># Unless required by applicable law or agreed to in writing, software</span>
-<span class="c1"># distributed under the License is distributed on an "AS IS" BASIS,</span>
-<span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span>
-<span class="c1"># See the License for the specific language governing permissions and</span>
-<span class="c1"># limitations under the License.</span>
+<span class="c1"># Licensed to the Apache Software Foundation (ASF) under one</span>
+<span class="c1"># or more contributor license agreements. See the NOTICE file</span>
+<span class="c1"># distributed with this work for additional information</span>
+<span class="c1"># regarding copyright ownership. The ASF licenses this file</span>
+<span class="c1"># to you under the Apache License, Version 2.0 (the</span>
+<span class="c1"># "License"); you may not use this file except in compliance</span>
+<span class="c1"># with the License. You may obtain a copy of the License at</span>
+<span class="c1"># </span>
+<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span>
+<span class="c1"># </span>
+<span class="c1"># Unless required by applicable law or agreed to in writing,</span>
+<span class="c1"># software distributed under the License is distributed on an</span>
+<span class="c1"># "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY</span>
+<span class="c1"># KIND, either express or implied. See the License for the</span>
+<span class="c1"># specific language governing permissions and limitations</span>
+<span class="c1"># under the License.</span>
<span class="kn">from</span> <span class="nn">tempfile</span> <span class="k">import</span> <span class="n">NamedTemporaryFile</span>
<span class="kn">import</span> <span class="nn">subprocess</span>
@@ -200,10 +207,13 @@
<span class="sd"> The locations of the source and the destination files in the local</span>
<span class="sd"> filesystem are provided as the first and second arguments to the</span>
<span class="sd"> transformation script. The transformation script is expected to read the</span>
-<span class="sd"> data from source , transform it and write the output to the local</span>
+<span class="sd"> data from source, transform it and write the output to the local</span>
<span class="sd"> destination file. The operator then takes over control and uploads the</span>
<span class="sd"> local destination file to S3.</span>
+<span class="sd"> S3 Select is also available to filter the source contents. Users can</span>
+<span class="sd"> omit the transformation script if an S3 Select expression is specified.</span>
+
<span class="sd"> :param source_s3_key: The key to be retrieved from S3</span>
<span class="sd"> :type source_s3_key: str</span>
<span class="sd"> :param source_aws_conn_id: source s3 connection</span>
@@ -216,6 +226,8 @@
<span class="sd"> :type replace: bool</span>
<span class="sd"> :param transform_script: location of the executable transformation script</span>
<span class="sd"> :type transform_script: str</span>
+<span class="sd"> :param select_expression: S3 Select expression</span>
+<span class="sd"> :type select_expression: str</span>
<span class="sd"> """</span>
<span class="n">template_fields</span> <span class="o">=</span> <span class="p">(</span><span class="s1">'source_s3_key'</span><span class="p">,</span> <span class="s1">'dest_s3_key'</span><span class="p">)</span>
@@ -227,7 +239,8 @@
<span class="bp">self</span><span class="p">,</span>
<span class="n">source_s3_key</span><span class="p">,</span>
<span class="n">dest_s3_key</span><span class="p">,</span>
- <span class="n">transform_script</span><span class="p">,</span>
+ <span class="n">transform_script</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
+ <span class="n">select_expression</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">source_aws_conn_id</span><span class="o">=</span><span class="s1">'aws_default'</span><span class="p">,</span>
<span class="n">dest_aws_conn_id</span><span class="o">=</span><span class="s1">'aws_default'</span><span class="p">,</span>
<span class="n">replace</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
@@ -239,34 +252,54 @@
<span class="bp">self</span><span class="o">.</span><span class="n">dest_aws_conn_id</span> <span class="o">=</span> <span class="n">dest_aws_conn_id</span>
<span class="bp">self</span><span class="o">.</span><span class="n">replace</span> <span class="o">=</span> <span class="n">replace</span>
<span class="bp">self</span><span class="o">.</span><span class="n">transform_script</span> <span class="o">=</span> <span class="n">transform_script</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">select_expression</span> <span class="o">=</span> <span class="n">select_expression</span>
<span class="k">def</span> <span class="nf">execute</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">context</span><span class="p">):</span>
+ <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">transform_script</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">select_expression</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
+ <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span>
+ <span class="s2">"Either transform_script or select_expression must be specified"</span><span class="p">)</span>
+
<span class="n">source_s3</span> <span class="o">=</span> <span class="n">S3Hook</span><span class="p">(</span><span class="n">aws_conn_id</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">source_aws_conn_id</span><span class="p">)</span>
<span class="n">dest_s3</span> <span class="o">=</span> <span class="n">S3Hook</span><span class="p">(</span><span class="n">aws_conn_id</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">dest_aws_conn_id</span><span class="p">)</span>
+
<span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Downloading source S3 file </span><span class="si">%s</span><span class="s2">"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">source_s3_key</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">source_s3</span><span class="o">.</span><span class="n">check_for_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">source_s3_key</span><span class="p">):</span>
- <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">"The source key </span><span class="si">{0}</span><span class="s2"> does not exist"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">source_s3_key</span><span class="p">))</span>
+ <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span>
+ <span class="s2">"The source key </span><span class="si">{0}</span><span class="s2"> does not exist"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">source_s3_key</span><span class="p">))</span>
<span class="n">source_s3_key_object</span> <span class="o">=</span> <span class="n">source_s3</span><span class="o">.</span><span class="n">get_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">source_s3_key</span><span class="p">)</span>
- <span class="k">with</span> <span class="n">NamedTemporaryFile</span><span class="p">(</span><span class="s2">"w"</span><span class="p">)</span> <span class="k">as</span> <span class="n">f_source</span><span class="p">,</span> <span class="n">NamedTemporaryFile</span><span class="p">(</span><span class="s2">"w"</span><span class="p">)</span> <span class="k">as</span> <span class="n">f_dest</span><span class="p">:</span>
+
+ <span class="k">with</span> <span class="n">NamedTemporaryFile</span><span class="p">(</span><span class="s2">"wb"</span><span class="p">)</span> <span class="k">as</span> <span class="n">f_source</span><span class="p">,</span> <span class="n">NamedTemporaryFile</span><span class="p">(</span><span class="s2">"wb"</span><span class="p">)</span> <span class="k">as</span> <span class="n">f_dest</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span>
<span class="s2">"Dumping S3 file </span><span class="si">%s</span><span class="s2"> contents to local file </span><span class="si">%s</span><span class="s2">"</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">source_s3_key</span><span class="p">,</span> <span class="n">f_source</span><span class="o">.</span><span class="n">name</span>
<span class="p">)</span>
- <span class="n">source_s3_key_object</span><span class="o">.</span><span class="n">get_contents_to_file</span><span class="p">(</span><span class="n">f_source</span><span class="p">)</span>
- <span class="n">f_source</span><span class="o">.</span><span class="n">flush</span><span class="p">()</span>
- <span class="n">source_s3</span><span class="o">.</span><span class="n">connection</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
- <span class="n">transform_script_process</span> <span class="o">=</span> <span class="n">subprocess</span><span class="o">.</span><span class="n">Popen</span><span class="p">(</span>
- <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">transform_script</span><span class="p">,</span> <span class="n">f_source</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">f_dest</span><span class="o">.</span><span class="n">name</span><span class="p">],</span>
- <span class="n">stdout</span><span class="o">=</span><span class="n">subprocess</span><span class="o">.</span><span class="n">PIPE</span><span class="p">,</span> <span class="n">stderr</span><span class="o">=</span><span class="n">subprocess</span><span class="o">.</span><span class="n">PIPE</span><span class="p">)</span>
- <span class="p">(</span><span class="n">transform_script_stdoutdata</span><span class="p">,</span> <span class="n">transform_script_stderrdata</span><span class="p">)</span> <span class="o">=</span> <span class="n">transform_script_process</span><span class="o">.</span><span class="n">communicate</span><span class="p">()</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Transform script stdout </span><span class="si">%s</span><span class="s2">"</span><span class="p">,</span> <span class="n">transform_script_stdoutdata</span><span class="p">)</span>
- <span class="k">if</span> <span class="n">transform_script_process</span><span class="o">.</span><span class="n">returncode</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span>
- <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">"Transform script failed </span><span class="si">%s</span><span class="s2">"</span><span class="p">,</span> <span class="n">transform_script_stderrdata</span><span class="p">)</span>
- <span class="k">else</span><span class="p">:</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span>
- <span class="s2">"Transform script successful. Output temporarily located at </span><span class="si">%s</span><span class="s2">"</span><span class="p">,</span>
- <span class="n">f_dest</span><span class="o">.</span><span class="n">name</span>
+
+ <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">select_expression</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
+ <span class="n">content</span> <span class="o">=</span> <span class="n">source_s3</span><span class="o">.</span><span class="n">select_key</span><span class="p">(</span>
+ <span class="n">key</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">source_s3_key</span><span class="p">,</span>
+ <span class="n">expression</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">select_expression</span>
<span class="p">)</span>
+ <span class="n">f_source</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">content</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">"utf-8"</span><span class="p">))</span>
+ <span class="k">else</span><span class="p">:</span>
+ <span class="n">source_s3_key_object</span><span class="o">.</span><span class="n">download_fileobj</span><span class="p">(</span><span class="n">Fileobj</span><span class="o">=</span><span class="n">f_source</span><span class="p">)</span>
+ <span class="n">f_source</span><span class="o">.</span><span class="n">flush</span><span class="p">()</span>
+
+ <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">transform_script</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
+ <span class="n">transform_script_process</span> <span class="o">=</span> <span class="n">subprocess</span><span class="o">.</span><span class="n">Popen</span><span class="p">(</span>
+ <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">transform_script</span><span class="p">,</span> <span class="n">f_source</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">f_dest</span><span class="o">.</span><span class="n">name</span><span class="p">],</span>
+ <span class="n">stdout</span><span class="o">=</span><span class="n">subprocess</span><span class="o">.</span><span class="n">PIPE</span><span class="p">,</span> <span class="n">stderr</span><span class="o">=</span><span class="n">subprocess</span><span class="o">.</span><span class="n">PIPE</span><span class="p">,</span> <span class="n">close_fds</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+ <span class="p">(</span><span class="n">transform_script_stdoutdata</span><span class="p">,</span> <span class="n">transform_script_stderrdata</span><span class="p">)</span> <span class="o">=</span> \
+ <span class="n">transform_script_process</span><span class="o">.</span><span class="n">communicate</span><span class="p">()</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Transform script stdout </span><span class="si">%s</span><span class="s2">"</span><span class="p">,</span> <span class="n">transform_script_stdoutdata</span><span class="p">)</span>
+ <span class="k">if</span> <span class="n">transform_script_process</span><span class="o">.</span><span class="n">returncode</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span>
+ <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span>
+ <span class="s2">"Transform script failed </span><span class="si">%s</span><span class="s2">"</span><span class="p">,</span> <span class="n">transform_script_stderrdata</span><span class="p">)</span>
+ <span class="k">else</span><span class="p">:</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span>
+ <span class="s2">"Transform script successful. Output temporarily located at </span><span class="si">%s</span><span class="s2">"</span><span class="p">,</span>
+ <span class="n">f_dest</span><span class="o">.</span><span class="n">name</span>
+ <span class="p">)</span>
+
<span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Uploading transformed file to S3"</span><span class="p">)</span>
<span class="n">f_dest</span><span class="o">.</span><span class="n">flush</span><span class="p">()</span>
<span class="n">dest_s3</span><span class="o">.</span><span class="n">load_file</span><span class="p">(</span>
@@ -274,8 +307,7 @@
<span class="n">key</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">dest_s3_key</span><span class="p">,</span>
<span class="n">replace</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">replace</span>
<span class="p">)</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Upload successful"</span><span class="p">)</span>
- <span class="n">dest_s3</span><span class="o">.</span><span class="n">connection</span><span class="o">.</span><span class="n">close</span><span class="p">()</span></div>
+ <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Upload successful"</span><span class="p">)</span></div>
</pre></div>
</div>
http://git-wip-us.apache.org/repos/asf/incubator-airflow-site/blob/11437c14/_modules/airflow/operators/s3_to_hive_operator.html
----------------------------------------------------------------------
diff --git a/_modules/airflow/operators/s3_to_hive_operator.html b/_modules/airflow/operators/s3_to_hive_operator.html
deleted file mode 100644
index be79923..0000000
--- a/_modules/airflow/operators/s3_to_hive_operator.html
+++ /dev/null
@@ -1,482 +0,0 @@
-
-
-<!DOCTYPE html>
-<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
-<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
-<head>
- <meta charset="utf-8">
-
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
-
- <title>airflow.operators.s3_to_hive_operator — Airflow Documentation</title>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- <link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
-
-
-
-
-
- <link rel="index" title="Index"
- href="../../../genindex.html"/>
- <link rel="search" title="Search" href="../../../search.html"/>
- <link rel="top" title="Airflow Documentation" href="../../../index.html"/>
- <link rel="up" title="Module code" href="../../index.html"/>
-
-
- <script src="../../../_static/js/modernizr.min.js"></script>
-
-</head>
-
-<body class="wy-body-for-nav" role="document">
-
-
- <div class="wy-grid-for-nav">
-
-
- <nav data-toggle="wy-nav-shift" class="wy-nav-side">
- <div class="wy-side-scroll">
- <div class="wy-side-nav-search">
-
-
-
- <a href="../../../index.html" class="icon icon-home"> Airflow
-
-
-
- </a>
-
-
-
-
-
-
-
-<div role="search">
- <form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
- <input type="text" name="q" placeholder="Search docs" />
- <input type="hidden" name="check_keywords" value="yes" />
- <input type="hidden" name="area" value="default" />
- </form>
-</div>
-
-
- </div>
-
- <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
-
-
-
-
-
-
- <ul>
-<li class="toctree-l1"><a class="reference internal" href="../../../project.html">Project</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../license.html">License</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../start.html">Quick Start</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../tutorial.html">Tutorial</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../configuration.html">Configuration</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../ui.html">UI / Screenshots</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../concepts.html">Concepts</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../profiling.html">Data Profiling</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../cli.html">Command Line Interface</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../scheduler.html">Scheduling &amp; Triggers</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../plugins.html">Plugins</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../security.html">Security</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../api.html">Experimental Rest API</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../integration.html">Integration</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../faq.html">FAQ</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../code.html">API Reference</a></li>
-</ul>
-
-
-
- </div>
- </div>
- </nav>
-
- <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
-
-
- <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
-
- <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
- <a href="../../../index.html">Airflow</a>
-
- </nav>
-
-
-
- <div class="wy-nav-content">
- <div class="rst-content">
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-<div role="navigation" aria-label="breadcrumbs navigation">
-
- <ul class="wy-breadcrumbs">
-
- <li><a href="../../../index.html">Docs</a> »</li>
-
- <li><a href="../../index.html">Module code</a> »</li>
-
- <li>airflow.operators.s3_to_hive_operator</li>
-
-
- <li class="wy-breadcrumbs-aside">
-
-
-
- </li>
-
- </ul>
-
-
- <hr/>
-</div>
- <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
- <div itemprop="articleBody">
-
- <h1>Source code for airflow.operators.s3_to_hive_operator</h1><div class="highlight"><pre>
-<span></span><span class="c1"># -*- coding: utf-8 -*-</span>
-<span class="c1">#</span>
-<span class="c1"># Licensed under the Apache License, Version 2.0 (the "License");</span>
-<span class="c1"># you may not use this file except in compliance with the License.</span>
-<span class="c1"># You may obtain a copy of the License at</span>
-<span class="c1">#</span>
-<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span>
-<span class="c1">#</span>
-<span class="c1"># Unless required by applicable law or agreed to in writing, software</span>
-<span class="c1"># distributed under the License is distributed on an "AS IS" BASIS,</span>
-<span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span>
-<span class="c1"># See the License for the specific language governing permissions and</span>
-<span class="c1"># limitations under the License.</span>
-
-<span class="kn">from</span> <span class="nn">builtins</span> <span class="k">import</span> <span class="nb">next</span>
-<span class="kn">from</span> <span class="nn">builtins</span> <span class="k">import</span> <span class="nb">zip</span>
-<span class="kn">from</span> <span class="nn">tempfile</span> <span class="k">import</span> <span class="n">NamedTemporaryFile</span>
-<span class="kn">from</span> <span class="nn">airflow.utils.file</span> <span class="k">import</span> <span class="n">TemporaryDirectory</span>
-<span class="kn">import</span> <span class="nn">gzip</span>
-<span class="kn">import</span> <span class="nn">bz2</span>
-<span class="kn">import</span> <span class="nn">tempfile</span>
-<span class="kn">import</span> <span class="nn">os</span>
-
-<span class="kn">from</span> <span class="nn">airflow.exceptions</span> <span class="k">import</span> <span class="n">AirflowException</span>
-<span class="kn">from</span> <span class="nn">airflow.hooks.S3_hook</span> <span class="k">import</span> <span class="n">S3Hook</span>
-<span class="kn">from</span> <span class="nn">airflow.hooks.hive_hooks</span> <span class="k">import</span> <span class="n">HiveCliHook</span>
-<span class="kn">from</span> <span class="nn">airflow.models</span> <span class="k">import</span> <span class="n">BaseOperator</span>
-<span class="kn">from</span> <span class="nn">airflow.utils.decorators</span> <span class="k">import</span> <span class="n">apply_defaults</span>
-<span class="kn">from</span> <span class="nn">airflow.utils.compression</span> <span class="k">import</span> <span class="n">uncompress_file</span>
-
-
-<div class="viewcode-block" id="S3ToHiveTransfer"><a class="viewcode-back" href="../../../integration.html#airflow.operators.s3_to_hive_operator.S3ToHiveTransfer">[docs]</a><span class="k">class</span> <span class="nc">S3ToHiveTransfer</span><span class="p">(</span><span class="n">BaseOperator</span><span class="p">):</span>
- <span class="sd">"""</span>
-<span class="sd"> Moves data from S3 to Hive. The operator downloads a file from S3,</span>
-<span class="sd"> stores the file locally before loading it into a Hive table.</span>
-<span class="sd"> If the ``create`` or ``recreate`` arguments are set to ``True``,</span>
-<span class="sd"> ``CREATE TABLE`` and ``DROP TABLE`` statements are generated.</span>
-<span class="sd"> Hive data types are inferred from the cursor's metadata.</span>
-
-<span class="sd"> Note that the table generated in Hive uses ``STORED AS textfile``</span>
-<span class="sd"> which isn't the most efficient serialization format. If a</span>
-<span class="sd"> large amount of data is loaded and/or if the table gets</span>
-<span class="sd"> queried considerably, you may want to use this operator only to</span>
-<span class="sd"> stage the data into a temporary table before loading it into its</span>
-<span class="sd"> final destination using a ``HiveOperator``.</span>
-
-<span class="sd"> :param s3_key: The key to be retrieved from S3</span>
-<span class="sd"> :type s3_key: str</span>
-<span class="sd"> :param field_dict: A dictionary of the fields name in the file</span>
-<span class="sd"> as keys and their Hive types as values</span>
-<span class="sd"> :type field_dict: dict</span>
-<span class="sd"> :param hive_table: target Hive table, use dot notation to target a</span>
-<span class="sd"> specific database</span>
-<span class="sd"> :type hive_table: str</span>
-<span class="sd"> :param create: whether to create the table if it doesn't exist</span>
-<span class="sd"> :type create: bool</span>
-<span class="sd"> :param recreate: whether to drop and recreate the table at every</span>
-<span class="sd"> execution</span>
-<span class="sd"> :type recreate: bool</span>
-<span class="sd"> :param partition: target partition as a dict of partition columns</span>
-<span class="sd"> and values</span>
-<span class="sd"> :type partition: dict</span>
-<span class="sd"> :param headers: whether the file contains column names on the first</span>
-<span class="sd"> line</span>
-<span class="sd"> :type headers: bool</span>
-<span class="sd"> :param check_headers: whether the column names on the first line should be</span>
-<span class="sd"> checked against the keys of field_dict</span>
-<span class="sd"> :type check_headers: bool</span>
-<span class="sd"> :param wildcard_match: whether the s3_key should be interpreted as a Unix</span>
-<span class="sd"> wildcard pattern</span>
-<span class="sd"> :type wildcard_match: bool</span>
-<span class="sd"> :param delimiter: field delimiter in the file</span>
-<span class="sd"> :type delimiter: str</span>
-<span class="sd"> :param aws_conn_id: source s3 connection</span>
-<span class="sd"> :type aws_conn_id: str</span>
-<span class="sd"> :param hive_cli_conn_id: destination hive connection</span>
-<span class="sd"> :type hive_cli_conn_id: str</span>
-<span class="sd"> :param input_compressed: Boolean to determine if file decompression is</span>
-<span class="sd"> required to process headers</span>
-<span class="sd"> :type input_compressed: bool</span>
-<span class="sd"> :param tblproperties: TBLPROPERTIES of the hive table being created</span>
-<span class="sd"> :type tblproperties: dict</span>
-<span class="sd"> """</span>
-
- <span class="n">template_fields</span> <span class="o">=</span> <span class="p">(</span><span class="s1">'s3_key'</span><span class="p">,</span> <span class="s1">'partition'</span><span class="p">,</span> <span class="s1">'hive_table'</span><span class="p">)</span>
- <span class="n">template_ext</span> <span class="o">=</span> <span class="p">()</span>
- <span class="n">ui_color</span> <span class="o">=</span> <span class="s1">'#a0e08c'</span>
-
- <span class="nd">@apply_defaults</span>
- <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span>
- <span class="bp">self</span><span class="p">,</span>
- <span class="n">s3_key</span><span class="p">,</span>
- <span class="n">field_dict</span><span class="p">,</span>
- <span class="n">hive_table</span><span class="p">,</span>
- <span class="n">delimiter</span><span class="o">=</span><span class="s1">','</span><span class="p">,</span>
- <span class="n">create</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
- <span class="n">recreate</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
- <span class="n">partition</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
- <span class="n">headers</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
- <span class="n">check_headers</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
- <span class="n">wildcard_match</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
- <span class="n">aws_conn_id</span><span class="o">=</span><span class="s1">'aws_default'</span><span class="p">,</span>
- <span class="n">hive_cli_conn_id</span><span class="o">=</span><span class="s1">'hive_cli_default'</span><span class="p">,</span>
- <span class="n">input_compressed</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
- <span class="n">tblproperties</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
- <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
- <span class="nb">super</span><span class="p">(</span><span class="n">S3ToHiveTransfer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span> <span class="o">=</span> <span class="n">s3_key</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span> <span class="o">=</span> <span class="n">field_dict</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">hive_table</span> <span class="o">=</span> <span class="n">hive_table</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span> <span class="o">=</span> <span class="n">delimiter</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">create</span> <span class="o">=</span> <span class="n">create</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">recreate</span> <span class="o">=</span> <span class="n">recreate</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">partition</span> <span class="o">=</span> <span class="n">partition</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">headers</span> <span class="o">=</span> <span class="n">headers</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">check_headers</span> <span class="o">=</span> <span class="n">check_headers</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">wildcard_match</span> <span class="o">=</span> <span class="n">wildcard_match</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">hive_cli_conn_id</span> <span class="o">=</span> <span class="n">hive_cli_conn_id</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">aws_conn_id</span> <span class="o">=</span> <span class="n">aws_conn_id</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">input_compressed</span> <span class="o">=</span> <span class="n">input_compressed</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">tblproperties</span> <span class="o">=</span> <span class="n">tblproperties</span>
-
- <span class="k">if</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">check_headers</span> <span class="ow">and</span>
- <span class="ow">not</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">headers</span><span class="p">)):</span>
- <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">"To check_headers provide "</span> <span class="o">+</span>
- <span class="s2">"field_dict and headers"</span><span class="p">)</span>
-
- <span class="k">def</span> <span class="nf">execute</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">context</span><span class="p">):</span>
- <span class="c1"># Downloading file from S3</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">s3</span> <span class="o">=</span> <span class="n">S3Hook</span><span class="p">(</span><span class="n">aws_conn_id</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">aws_conn_id</span><span class="p">)</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">hive</span> <span class="o">=</span> <span class="n">HiveCliHook</span><span class="p">(</span><span class="n">hive_cli_conn_id</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">hive_cli_conn_id</span><span class="p">)</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Downloading S3 file"</span><span class="p">)</span>
-
- <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">wildcard_match</span><span class="p">:</span>
- <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">check_for_wildcard_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">):</span>
- <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">"No key matches </span><span class="si">{0}</span><span class="s2">"</span>
- <span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">))</span>
- <span class="n">s3_key_object</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">get_wildcard_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">)</span>
- <span class="k">else</span><span class="p">:</span>
- <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">check_for_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">):</span>
- <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span>
- <span class="s2">"The key </span><span class="si">{0}</span><span class="s2"> does not exists"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">))</span>
- <span class="n">s3_key_object</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">get_key</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">)</span>
- <span class="n">root</span><span class="p">,</span> <span class="n">file_ext</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">splitext</span><span class="p">(</span><span class="n">s3_key_object</span><span class="o">.</span><span class="n">key</span><span class="p">)</span>
- <span class="k">with</span> <span class="n">TemporaryDirectory</span><span class="p">(</span><span class="n">prefix</span><span class="o">=</span><span class="s1">'tmps32hive_'</span><span class="p">)</span> <span class="k">as</span> <span class="n">tmp_dir</span><span class="p">,</span>\
- <span class="n">NamedTemporaryFile</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s2">"wb"</span><span class="p">,</span>
- <span class="nb">dir</span><span class="o">=</span><span class="n">tmp_dir</span><span class="p">,</span>
- <span class="n">suffix</span><span class="o">=</span><span class="n">file_ext</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Dumping S3 key </span><span class="si">{0}</span><span class="s2"> contents to local file </span><span class="si">{1}</span><span class="s2">"</span>
- <span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">s3_key_object</span><span class="o">.</span><span class="n">key</span><span class="p">,</span> <span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">))</span>
- <span class="n">s3_key_object</span><span class="o">.</span><span class="n">download_fileobj</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
- <span class="n">f</span><span class="o">.</span><span class="n">flush</span><span class="p">()</span>
- <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">headers</span><span class="p">:</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Loading file </span><span class="si">%s</span><span class="s2"> into Hive"</span><span class="p">,</span> <span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">hive</span><span class="o">.</span><span class="n">load_file</span><span class="p">(</span>
- <span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">,</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">hive_table</span><span class="p">,</span>
- <span class="n">field_dict</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span><span class="p">,</span>
- <span class="n">create</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">create</span><span class="p">,</span>
- <span class="n">partition</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">partition</span><span class="p">,</span>
- <span class="n">delimiter</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span><span class="p">,</span>
- <span class="n">recreate</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">recreate</span><span class="p">,</span>
- <span class="n">tblproperties</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">tblproperties</span><span class="p">)</span>
- <span class="k">else</span><span class="p">:</span>
- <span class="c1"># Decompressing file</span>
- <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">input_compressed</span><span class="p">:</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Uncompressing file </span><span class="si">%s</span><span class="s2">"</span><span class="p">,</span> <span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
- <span class="n">fn_uncompressed</span> <span class="o">=</span> <span class="n">uncompress_file</span><span class="p">(</span><span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">,</span>
- <span class="n">file_ext</span><span class="p">,</span>
- <span class="n">tmp_dir</span><span class="p">)</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Uncompressed to </span><span class="si">%s</span><span class="s2">"</span><span class="p">,</span> <span class="n">fn_uncompressed</span><span class="p">)</span>
- <span class="c1"># uncompressed file available now so deleting</span>
- <span class="c1"># compressed file to save disk space</span>
- <span class="n">f</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
- <span class="k">else</span><span class="p">:</span>
- <span class="n">fn_uncompressed</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">name</span>
-
- <span class="c1"># Testing if header matches field_dict</span>
- <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">check_headers</span><span class="p">:</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Matching file header against field_dict"</span><span class="p">)</span>
- <span class="n">header_list</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_top_row_as_list</span><span class="p">(</span><span class="n">fn_uncompressed</span><span class="p">)</span>
- <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_match_headers</span><span class="p">(</span><span class="n">header_list</span><span class="p">):</span>
- <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">"Header check failed"</span><span class="p">)</span>
-
- <span class="c1"># Deleting top header row</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Removing header from file </span><span class="si">%s</span><span class="s2">"</span><span class="p">,</span> <span class="n">fn_uncompressed</span><span class="p">)</span>
- <span class="n">headless_file</span> <span class="o">=</span> <span class="p">(</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">_delete_top_row_and_compress</span><span class="p">(</span><span class="n">fn_uncompressed</span><span class="p">,</span>
- <span class="n">file_ext</span><span class="p">,</span>
- <span class="n">tmp_dir</span><span class="p">))</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Headless file </span><span class="si">%s</span><span class="s2">"</span><span class="p">,</span> <span class="n">headless_file</span><span class="p">)</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Loading file </span><span class="si">%s</span><span class="s2"> into Hive"</span><span class="p">,</span> <span class="n">headless_file</span><span class="p">)</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">hive</span><span class="o">.</span><span class="n">load_file</span><span class="p">(</span><span class="n">headless_file</span><span class="p">,</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">hive_table</span><span class="p">,</span>
- <span class="n">field_dict</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span><span class="p">,</span>
- <span class="n">create</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">create</span><span class="p">,</span>
- <span class="n">partition</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">partition</span><span class="p">,</span>
- <span class="n">delimiter</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span><span class="p">,</span>
- <span class="n">recreate</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">recreate</span><span class="p">,</span>
- <span class="n">tblproperties</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">tblproperties</span><span class="p">)</span>
-
- <span class="k">def</span> <span class="nf">_get_top_row_as_list</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">file_name</span><span class="p">):</span>
- <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">file_name</span><span class="p">,</span> <span class="s1">'rt'</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
- <span class="n">header_line</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">readline</span><span class="p">()</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span>
- <span class="n">header_list</span> <span class="o">=</span> <span class="n">header_line</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">delimiter</span><span class="p">)</span>
- <span class="k">return</span> <span class="n">header_list</span>
-
- <span class="k">def</span> <span class="nf">_match_headers</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">header_list</span><span class="p">):</span>
- <span class="k">if</span> <span class="ow">not</span> <span class="n">header_list</span><span class="p">:</span>
- <span class="k">raise</span> <span class="n">AirflowException</span><span class="p">(</span><span class="s2">"Unable to retrieve header row from file"</span><span class="p">)</span>
- <span class="n">field_names</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">field_dict</span><span class="o">.</span><span class="n">keys</span><span class="p">()</span>
- <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">field_names</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">header_list</span><span class="p">):</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="s2">"Headers count mismatch"</span>
- <span class="s2">"File headers:</span><span class="se">\n</span><span class="s2"> </span><span class="si">{header_list}</span><span class="se">\n</span><span class="s2">"</span>
- <span class="s2">"Field names: </span><span class="se">\n</span><span class="s2"> </span><span class="si">{field_names}</span><span class="se">\n</span><span class="s2">"</span>
- <span class="s2">""</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="o">**</span><span class="nb">locals</span><span class="p">()))</span>
- <span class="k">return</span> <span class="kc">False</span>
- <span class="n">test_field_match</span> <span class="o">=</span> <span class="p">[</span><span class="n">h1</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="o">==</span> <span class="n">h2</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span>
- <span class="k">for</span> <span class="n">h1</span><span class="p">,</span> <span class="n">h2</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">header_list</span><span class="p">,</span> <span class="n">field_names</span><span class="p">)]</span>
- <span class="k">if</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span><span class="n">test_field_match</span><span class="p">):</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="s2">"Headers do not match field names"</span>
- <span class="s2">"File headers:</span><span class="se">\n</span><span class="s2"> </span><span class="si">{header_list}</span><span class="se">\n</span><span class="s2">"</span>
- <span class="s2">"Field names: </span><span class="se">\n</span><span class="s2"> </span><span class="si">{field_names}</span><span class="se">\n</span><span class="s2">"</span>
- <span class="s2">""</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="o">**</span><span class="nb">locals</span><span class="p">()))</span>
- <span class="k">return</span> <span class="kc">False</span>
- <span class="k">else</span><span class="p">:</span>
- <span class="k">return</span> <span class="kc">True</span>
-
- <span class="k">def</span> <span class="nf">_delete_top_row_and_compress</span><span class="p">(</span>
- <span class="bp">self</span><span class="p">,</span>
- <span class="n">input_file_name</span><span class="p">,</span>
- <span class="n">output_file_ext</span><span class="p">,</span>
- <span class="n">dest_dir</span><span class="p">):</span>
- <span class="c1"># When output_file_ext is not defined, file is not compressed</span>
- <span class="n">open_fn</span> <span class="o">=</span> <span class="nb">open</span>
- <span class="k">if</span> <span class="n">output_file_ext</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="o">==</span> <span class="s1">'.gz'</span><span class="p">:</span>
- <span class="n">open_fn</span> <span class="o">=</span> <span class="n">gzip</span><span class="o">.</span><span class="n">GzipFile</span>
- <span class="k">elif</span> <span class="n">output_file_ext</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="o">==</span> <span class="s1">'.bz2'</span><span class="p">:</span>
- <span class="n">open_fn</span> <span class="o">=</span> <span class="n">bz2</span><span class="o">.</span><span class="n">BZ2File</span>
-
- <span class="n">os_fh_output</span><span class="p">,</span> <span class="n">fn_output</span> <span class="o">=</span> \
- <span class="n">tempfile</span><span class="o">.</span><span class="n">mkstemp</span><span class="p">(</span><span class="n">suffix</span><span class="o">=</span><span class="n">output_file_ext</span><span class="p">,</span> <span class="nb">dir</span><span class="o">=</span><span class="n">dest_dir</span><span class="p">)</span>
- <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">input_file_name</span><span class="p">,</span> <span class="s1">'rb'</span><span class="p">)</span> <span class="k">as</span> <span class="n">f_in</span><span class="p">,</span>\
- <span class="n">open_fn</span><span class="p">(</span><span class="n">fn_output</span><span class="p">,</span> <span class="s1">'wb'</span><span class="p">)</span> <span class="k">as</span> <span class="n">f_out</span><span class="p">:</span>
- <span class="n">f_in</span><span class="o">.</span><span class="n">seek</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
- <span class="nb">next</span><span class="p">(</span><span class="n">f_in</span><span class="p">)</span>
- <span class="k">for</span> <span class="n">line</span> <span class="ow">in</span> <span class="n">f_in</span><span class="p">:</span>
- <span class="n">f_out</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">line</span><span class="p">)</span>
- <span class="k">return</span> <span class="n">fn_output</span></div>
-</pre></div>
-
- </div>
- <div class="articleComments">
-
- </div>
- </div>
- <footer>
-
-
- <hr/>
-
- <div role="contentinfo">
- <p>
-
- </p>
- </div>
- Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
-
-</footer>
-
- </div>
- </div>
-
- </section>
-
- </div>
-
-
-
-
-
- <script type="text/javascript">
- var DOCUMENTATION_OPTIONS = {
- URL_ROOT:'../../../',
- VERSION:'',
- COLLAPSE_INDEX:false,
- FILE_SUFFIX:'.html',
- HAS_SOURCE: true,
- SOURCELINK_SUFFIX: '.txt'
- };
- </script>
- <script type="text/javascript" src="../../../_static/jquery.js"></script>
- <script type="text/javascript" src="../../../_static/underscore.js"></script>
- <script type="text/javascript" src="../../../_static/doctools.js"></script>
-
-
-
-
-
- <script type="text/javascript" src="../../../_static/js/theme.js"></script>
-
-
-
-
- <script type="text/javascript">
- jQuery(function () {
- SphinxRtdTheme.StickyNav.enable();
- });
- </script>
-
-
-</body>
-</html>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-airflow-site/blob/11437c14/_modules/airflow/operators/s3_to_redshift_operator.html
----------------------------------------------------------------------
diff --git a/_modules/airflow/operators/s3_to_redshift_operator.html b/_modules/airflow/operators/s3_to_redshift_operator.html
new file mode 100644
index 0000000..e83cdfd
--- /dev/null
+++ b/_modules/airflow/operators/s3_to_redshift_operator.html
@@ -0,0 +1,330 @@
+
+
+<!DOCTYPE html>
+<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
+<head>
+ <meta charset="utf-8">
+
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+
+ <title>airflow.operators.s3_to_redshift_operator — Airflow Documentation</title>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
+
+
+
+
+
+ <link rel="index" title="Index"
+ href="../../../genindex.html"/>
+ <link rel="search" title="Search" href="../../../search.html"/>
+ <link rel="top" title="Airflow Documentation" href="../../../index.html"/>
+ <link rel="up" title="Module code" href="../../index.html"/>
+
+
+ <script src="../../../_static/js/modernizr.min.js"></script>
+
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+
+ <div class="wy-grid-for-nav">
+
+
+ <nav data-toggle="wy-nav-shift" class="wy-nav-side">
+ <div class="wy-side-scroll">
+ <div class="wy-side-nav-search">
+
+
+
+ <a href="../../../index.html" class="icon icon-home"> Airflow
+
+
+
+ </a>
+
+
+
+
+
+
+
+<div role="search">
+ <form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
+ <input type="text" name="q" placeholder="Search docs" />
+ <input type="hidden" name="check_keywords" value="yes" />
+ <input type="hidden" name="area" value="default" />
+ </form>
+</div>
+
+
+ </div>
+
+ <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
+
+
+
+
+
+
+ <ul>
+<li class="toctree-l1"><a class="reference internal" href="../../../project.html">Project</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../license.html">License</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../start.html">Quick Start</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../tutorial.html">Tutorial</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../howto/index.html">How-to Guides</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../ui.html">UI / Screenshots</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../concepts.html">Concepts</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../profiling.html">Data Profiling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../cli.html">Command Line Interface</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../scheduler.html">Scheduling & Triggers</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../plugins.html">Plugins</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../security.html">Security</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../timezone.html">Time zones</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../api.html">Experimental Rest API</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../integration.html">Integration</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../lineage.html">Lineage</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../faq.html">FAQ</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../code.html">API Reference</a></li>
+</ul>
+
+
+
+ </div>
+ </div>
+ </nav>
+
+ <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+
+
+ <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
+
+ <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+ <a href="../../../index.html">Airflow</a>
+
+ </nav>
+
+
+
+ <div class="wy-nav-content">
+ <div class="rst-content">
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+<div role="navigation" aria-label="breadcrumbs navigation">
+
+ <ul class="wy-breadcrumbs">
+
+ <li><a href="../../../index.html">Docs</a> »</li>
+
+ <li><a href="../../index.html">Module code</a> »</li>
+
+ <li>airflow.operators.s3_to_redshift_operator</li>
+
+
+ <li class="wy-breadcrumbs-aside">
+
+
+
+ </li>
+
+ </ul>
+
+
+ <hr/>
+</div>
+ <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+ <div itemprop="articleBody">
+
+ <h1>Source code for airflow.operators.s3_to_redshift_operator</h1><div class="highlight"><pre>
+<span></span><span class="c1"># -*- coding: utf-8 -*-</span>
+<span class="c1">#</span>
+<span class="c1"># Licensed to the Apache Software Foundation (ASF) under one</span>
+<span class="c1"># or more contributor license agreements. See the NOTICE file</span>
+<span class="c1"># distributed with this work for additional information</span>
+<span class="c1"># regarding copyright ownership. The ASF licenses this file</span>
+<span class="c1"># to you under the Apache License, Version 2.0 (the</span>
+<span class="c1"># "License"); you may not use this file except in compliance</span>
+<span class="c1"># with the License. You may obtain a copy of the License at</span>
+<span class="c1"># </span>
+<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span>
+<span class="c1"># </span>
+<span class="c1"># Unless required by applicable law or agreed to in writing,</span>
+<span class="c1"># software distributed under the License is distributed on an</span>
+<span class="c1"># "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY</span>
+<span class="c1"># KIND, either express or implied. See the License for the</span>
+<span class="c1"># specific language governing permissions and limitations</span>
+<span class="c1"># under the License.</span>
+<span class="kn">from</span> <span class="nn">airflow.hooks.postgres_hook</span> <span class="k">import</span> <span class="n">PostgresHook</span>
+<span class="kn">from</span> <span class="nn">airflow.hooks.S3_hook</span> <span class="k">import</span> <span class="n">S3Hook</span>
+<span class="kn">from</span> <span class="nn">airflow.models</span> <span class="k">import</span> <span class="n">BaseOperator</span>
+<span class="kn">from</span> <span class="nn">airflow.utils.decorators</span> <span class="k">import</span> <span class="n">apply_defaults</span>
+
+
+<div class="viewcode-block" id="S3ToRedshiftTransfer"><a class="viewcode-back" href="../../../integration.html#airflow.operators.s3_to_redshift_operator.S3ToRedshiftTransfer">[docs]</a><span class="k">class</span> <span class="nc">S3ToRedshiftTransfer</span><span class="p">(</span><span class="n">BaseOperator</span><span class="p">):</span>
+ <span class="sd">"""</span>
+<span class="sd"> Executes a COPY command to load files from S3 to Redshift</span>
+
+<span class="sd"> :param schema: reference to a specific schema in redshift database</span>
+<span class="sd"> :type schema: string</span>
+<span class="sd"> :param table: reference to a specific table in redshift database</span>
+<span class="sd"> :type table: string</span>
+<span class="sd"> :param s3_bucket: reference to a specific S3 bucket</span>
+<span class="sd"> :type s3_bucket: string</span>
+<span class="sd"> :param s3_key: reference to a specific S3 key</span>
+<span class="sd"> :type s3_key: string</span>
+<span class="sd"> :param redshift_conn_id: reference to a specific redshift database</span>
+<span class="sd"> :type redshift_conn_id: string</span>
+<span class="sd"> :param aws_conn_id: reference to a specific S3 connection</span>
+<span class="sd"> :type aws_conn_id: string</span>
+<span class="sd"> :param copy_options: reference to a list of COPY options</span>
+<span class="sd"> :type copy_options: list</span>
+<span class="sd"> """</span>
+
+ <span class="n">template_fields</span> <span class="o">=</span> <span class="p">()</span>
+ <span class="n">template_ext</span> <span class="o">=</span> <span class="p">()</span>
+ <span class="n">ui_color</span> <span class="o">=</span> <span class="s1">'#ededed'</span>
+
+ <span class="nd">@apply_defaults</span>
+ <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span>
+ <span class="bp">self</span><span class="p">,</span>
+ <span class="n">schema</span><span class="p">,</span>
+ <span class="n">table</span><span class="p">,</span>
+ <span class="n">s3_bucket</span><span class="p">,</span>
+ <span class="n">s3_key</span><span class="p">,</span>
+ <span class="n">redshift_conn_id</span><span class="o">=</span><span class="s1">'redshift_default'</span><span class="p">,</span>
+ <span class="n">aws_conn_id</span><span class="o">=</span><span class="s1">'aws_default'</span><span class="p">,</span>
+ <span class="n">copy_options</span><span class="o">=</span><span class="nb">tuple</span><span class="p">(),</span>
+ <span class="n">autocommit</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+ <span class="n">parameters</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
+ <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
+ <span class="nb">super</span><span class="p">(</span><span class="n">S3ToRedshiftTransfer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">schema</span> <span class="o">=</span> <span class="n">schema</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">table</span> <span class="o">=</span> <span class="n">table</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">s3_bucket</span> <span class="o">=</span> <span class="n">s3_bucket</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span> <span class="o">=</span> <span class="n">s3_key</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">redshift_conn_id</span> <span class="o">=</span> <span class="n">redshift_conn_id</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">aws_conn_id</span> <span class="o">=</span> <span class="n">aws_conn_id</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">copy_options</span> <span class="o">=</span> <span class="n">copy_options</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">autocommit</span> <span class="o">=</span> <span class="n">autocommit</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">parameters</span> <span class="o">=</span> <span class="n">parameters</span>
+
+ <span class="k">def</span> <span class="nf">execute</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">context</span><span class="p">):</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">hook</span> <span class="o">=</span> <span class="n">PostgresHook</span><span class="p">(</span><span class="n">postgres_conn_id</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">redshift_conn_id</span><span class="p">)</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">s3</span> <span class="o">=</span> <span class="n">S3Hook</span><span class="p">(</span><span class="n">aws_conn_id</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">aws_conn_id</span><span class="p">)</span>
+ <span class="n">credentials</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">s3</span><span class="o">.</span><span class="n">get_credentials</span><span class="p">()</span>
+ <span class="n">copy_options</span> <span class="o">=</span> <span class="s1">'</span><span class="se">\n\t\t\t</span><span class="s1">'</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">copy_options</span><span class="p">)</span>
+
+ <span class="n">copy_query</span> <span class="o">=</span> <span class="s2">"""</span>
+<span class="s2"> COPY </span><span class="si">{schema}</span><span class="s2">.</span><span class="si">{table}</span><span class="s2"></span>
+<span class="s2"> FROM 's3://</span><span class="si">{s3_bucket}</span><span class="s2">/</span><span class="si">{s3_key}</span><span class="s2">/</span><span class="si">{table}</span><span class="s2">'</span>
+<span class="s2"> with credentials</span>
+<span class="s2"> 'aws_access_key_id=</span><span class="si">{access_key}</span><span class="s2">;aws_secret_access_key=</span><span class="si">{secret_key}</span><span class="s2">'</span>
+<span class="s2"> </span><span class="si">{copy_options}</span><span class="s2">;</span>
+<span class="s2"> """</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">schema</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">schema</span><span class="p">,</span>
+ <span class="n">table</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">table</span><span class="p">,</span>
+ <span class="n">s3_bucket</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_bucket</span><span class="p">,</span>
+ <span class="n">s3_key</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">s3_key</span><span class="p">,</span>
+ <span class="n">access_key</span><span class="o">=</span><span class="n">credentials</span><span class="o">.</span><span class="n">access_key</span><span class="p">,</span>
+ <span class="n">secret_key</span><span class="o">=</span><span class="n">credentials</span><span class="o">.</span><span class="n">secret_key</span><span class="p">,</span>
+ <span class="n">copy_options</span><span class="o">=</span><span class="n">copy_options</span><span class="p">)</span>
+
+ <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s1">'Executing COPY command...'</span><span class="p">)</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">hook</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="n">copy_query</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">autocommit</span><span class="p">)</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">log</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"COPY command complete..."</span><span class="p">)</span></div>
+</pre></div>
+
+ </div>
+ <div class="articleComments">
+
+ </div>
+ </div>
+ <footer>
+
+
+ <hr/>
+
+ <div role="contentinfo">
+ <p>
+
+ </p>
+ </div>
+ Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+
+</footer>
+
+ </div>
+ </div>
+
+ </section>
+
+ </div>
+
+
+
+
+
+ <script type="text/javascript">
+ var DOCUMENTATION_OPTIONS = { // page-level settings object declared as a global; presumably read by the doctools.js script loaded below (TODO confirm)
+ URL_ROOT:'../../../', // same relative prefix the _static script URLs on this page use
+ VERSION:'', // empty string in this build
+ COLLAPSE_INDEX:false,
+ FILE_SUFFIX:'.html',
+ HAS_SOURCE: true,
+ SOURCELINK_SUFFIX: '.txt'
+ };
+ </script>
+ <script type="text/javascript" src="../../../_static/jquery.js"></script>
+ <script type="text/javascript" src="../../../_static/underscore.js"></script>
+ <script type="text/javascript" src="../../../_static/doctools.js"></script>
+
+
+
+
+
+ <script type="text/javascript" src="../../../_static/js/theme.js"></script>
+
+
+
+
+ <script type="text/javascript">
+ jQuery(function () { // jQuery(fn) defers fn until the DOM is ready
+ SphinxRtdTheme.StickyNav.enable(); // SphinxRtdTheme is not defined on this page; presumably provided by _static/js/theme.js loaded above (TODO confirm)
+ });
+ </script>
+
+
+</body>
+</html>
\ No newline at end of file