Posted to commits@airflow.apache.org by cr...@apache.org on 2016/06/01 21:57:28 UTC

incubator-airflow git commit: [AIRFLOW-155] Documentation of Qubole Operator

Repository: incubator-airflow
Updated Branches:
  refs/heads/master c2384cb41 -> dce08f68b


[AIRFLOW-155] Documentation of Qubole Operator

Dear Airflow Maintainers,

Please accept this PR that addresses the following issues:
- *https://issues.apache.org/jira/browse/AIRFLOW-155*

Thanks,
Sumit

Author: Sumit Maheshwari <su...@qubole.com>

Closes #1560 from msumit/AIRFLOW-155.


Project: http://git-wip-us.apache.org/repos/asf/incubator-airflow/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-airflow/commit/dce08f68
Tree: http://git-wip-us.apache.org/repos/asf/incubator-airflow/tree/dce08f68
Diff: http://git-wip-us.apache.org/repos/asf/incubator-airflow/diff/dce08f68

Branch: refs/heads/master
Commit: dce08f68bccac030abe4cf60c5862b6ce7767e38
Parents: c2384cb
Author: Sumit Maheshwari <su...@qubole.com>
Authored: Wed Jun 1 14:57:33 2016 -0700
Committer: Chris Riccomini <ch...@wepay.com>
Committed: Wed Jun 1 14:57:33 2016 -0700

----------------------------------------------------------------------
 README.md                                    |   1 +
 airflow/contrib/operators/qubole_operator.py | 144 ++++++++++------------
 docs/code.rst                                |   1 +
 3 files changed, 67 insertions(+), 79 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-airflow/blob/dce08f68/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index d27c826..43c8818 100644
--- a/README.md
+++ b/README.md
@@ -104,6 +104,7 @@ Currently **officially** using Airflow:
 * [Lucid](http://luc.id) [[@jbrownlucid](https://github.com/jbrownlucid) & [@kkourtchikov](https://github.com/kkourtchikov)]
 * [Lyft](https://www.lyft.com/)[[@SaurabhBajaj](https://github.com/SaurabhBajaj)]
 * [Nerdwallet](https://www.nerdwallet.com)
+* [Qubole](https://qubole.com) [[@msumit](https://github.com/msumit)]
 * [Sense360](https://github.com/Sense360) [[@kamilmroczek](https://github.com/KamilMroczek)]
 * [Sidecar](https://hello.getsidecar.com/) [[@getsidecar](https://github.com/getsidecar)]
 * [SimilarWeb](https://www.similarweb.com/) [[@similarweb](https://github.com/similarweb)]

http://git-wip-us.apache.org/repos/asf/incubator-airflow/blob/dce08f68/airflow/contrib/operators/qubole_operator.py
----------------------------------------------------------------------
diff --git a/airflow/contrib/operators/qubole_operator.py b/airflow/contrib/operators/qubole_operator.py
index 1cec673..1f09931 100755
--- a/airflow/contrib/operators/qubole_operator.py
+++ b/airflow/contrib/operators/qubole_operator.py
@@ -5,96 +5,82 @@ from airflow.contrib.hooks import QuboleHook
 
 class QuboleOperator(BaseOperator):
     """
-    Executes commands on Qubole (https://qubole.com).
+    Execute tasks (commands) on QDS (https://qubole.com).
+
+    :param qubole_conn_id: Connection ID which holds the QDS auth_token
+    :type qubole_conn_id: str
+
+    kwargs:
+        :command_type: type of command to be executed, e.g. hivecmd, shellcmd, hadoopcmd
+        :tags: array of tags to be assigned to the command
+        :cluster_label: cluster label on which the command will be executed
+        :name: name to be given to the command
+
+        **Arguments specific to command types**
 
-    mandatory:
-        :param command_type: type of command to be executed, e.g. hivecmd, shellcmd, hadoopcmd
-    other:
         hivecmd:
-            :param query: inline query statement
-            :param script_location: s3 location containing query statement
-            :param sample_size:
-            :param macros: macro values which were used in query
-            :param tags: array of tags to be assigned with the command
-            :param cluster_label: cluster label on which to execute command
-            :param name: name to be given to command
+            :query: inline query statement
+            :script_location: s3 location containing query statement
+            :sample_size: size of sample in bytes on which to run query
+            :macros: macro values which were used in query
         prestocmd:
-            :param query: inline query statement
-            :param script_location: s3 location containing query statement
-            :param macros: macro values which were used in query
-            :param tags: array of tags to be assigned with the command
-            :param cluster_label: cluster label on which to execute command
-            :param name: name to be given to command
+            :query: inline query statement
+            :script_location: s3 location containing query statement
+            :macros: macro values which were used in query
         hadoopcmd:
-            :param sub_commnad: must be one these ["jar", "s3distcp", "streaming"] followed by 1 or more args
-            :param tags: array of tags to be assigned with the command
-            :param cluster_label: cluster label on which to execute command
-            :param name: name to be given to command
+            :sub_command: must be one of these ["jar", "s3distcp", "streaming"] followed by 1 or more args
         shellcmd:
-            :param script: inline command with args
-            :param script_location: s3 location containing query statement
-            :param files: list of files in s3 bucket as file1,file2 format. These files will be copied into the working
-                          directory where the qubole command is being executed.
-            :param archives: list of archives in s3 bucket as archive1,archive2 format. These will be unarchived into
-                             the working directory where the qubole command is being executed
-            :param parameters: any extra args which need to be passed to script (only wwhen script_location is supplied)
-            :param tags: array of tags to be assigned with the command
-            :param cluster_label: cluster label on which to execute command
-            :param name: name to be given to command
+            :script: inline command with args
+            :script_location: s3 location containing query statement
+            :files: list of files in s3 bucket as file1,file2 format. These files will be copied into the working directory where the qubole command is being executed.
+            :archives: list of archives in s3 bucket as archive1,archive2 format. These will be unarchived into the working directory where the qubole command is being executed.
+            :parameters: any extra args which need to be passed to script (only when script_location is supplied)
         pigcmd:
-            :param script: inline query statement (latin_statements)
-            :param script_location: s3 location containing pig query
-            :param parameters: any extra args which need to be passed to script (only wwhen script_location is supplied
-            :param tags: array of tags to be assigned with the command
-            :param cluster_label: cluster label on which to execute command
-            :param name: name to be given to command
-        dbtapquerycmd:
-            :param db_tap_id: data store ID of the target database, in Qubole.
-            :param query: inline query statement
-            :param macros: macro values which were used in query
-            :param tags: array of tags to be assigned with the command
-            :param name: name to be given to command
+            :script: inline query statement (latin_statements)
+            :script_location: s3 location containing pig query
+            :parameters: any extra args which need to be passed to script (only when script_location is supplied)
         sparkcmd:
-            :param program: the complete Spark Program in Scala, SQL, Command, R, or Python
-            :param cmdline: spark-submit command line, all required information must be specify in cmdline itself.
-            :param sql: inline sql query
-            :param script_location: s3 location containing query statement
-            :param language: language of the program, Scala, SQL, Command, R, or Python
-            :param app_id: ID of an Spark job server app
-            :param arguments: spark-submit command line arguments
-            :param user_program_arguments: arguments that the user program takes in
-            :param macros: macro values which were used in query
-            :param tags: array of tags to be assigned with the command
-            :param cluster_label: cluster label on which to execute command
-            :param name: name to be given to command
+            :program: the complete Spark Program in Scala, SQL, Command, R, or Python
+            :cmdline: spark-submit command line; all required information must be specified in cmdline itself.
+            :sql: inline sql query
+            :script_location: s3 location containing query statement
+            :language: language of the program, Scala, SQL, Command, R, or Python
+            :app_id: ID of a Spark job server app
+            :arguments: spark-submit command line arguments
+            :user_program_arguments: arguments that the user program takes in
+            :macros: macro values which were used in query
+        dbtapquerycmd:
+            :db_tap_id: data store ID of the target database, in Qubole.
+            :query: inline query statement
+            :macros: macro values which were used in query
         dbexportcmd:
-            :param mode: 1 (simple), 2 (advance)
-            :param hive_table: Name of the hive table
-            :param partition_spec: partition specification for Hive table.
-            :param dbtap_id: data store ID of the target database, in Qubole.
-            :param db_table: name of the db table
-            :param db_update_mode: allowinsert or updateonly
-            :param db_update_keys: columns used to determine the uniqueness of rows
-            :param export_dir: HDFS/S3 location from which data will be exported.
-            :param fields_terminated_by:
-            :param tags: array of tags to be assigned with the command
-            :param name: name to be given to command
+            :mode: 1 (simple), 2 (advanced)
+            :hive_table: Name of the hive table
+            :partition_spec: partition specification for Hive table.
+            :dbtap_id: data store ID of the target database, in Qubole.
+            :db_table: name of the db table
+            :db_update_mode: allowinsert or updateonly
+            :db_update_keys: columns used to determine the uniqueness of rows
+            :export_dir: HDFS/S3 location from which data will be exported.
+            :fields_terminated_by: hex of the char used as column separator in the dataset.
         dbimportcmd:
-            :param mode: 1 (simple), 2 (advance)
-            :param hive_table: Name of the hive table
-            :param dbtap_id: data store ID of the target database, in Qubole.
-            :param db_table: name of the db table
-            :param where_clause: where clause, if any
-            :param parallelism: number of parallel db connections to use for extracting data
-            :param extract_query: SQL query to extract data from db. $CONDITIONS must be part of the where clause.
-            :param boundary_query: Query to be used get range of row IDs to be extracted
-            :param split_column: Column used as row ID to split data into ranges (mode 2)
-            :param tags: array of tags to be assigned with the command
-            :param name: name to be given to command
-
+            :mode: 1 (simple), 2 (advanced)
+            :hive_table: Name of the hive table
+            :dbtap_id: data store ID of the target database, in Qubole.
+            :db_table: name of the db table
+            :where_clause: where clause, if any
+            :parallelism: number of parallel db connections to use for extracting data
+            :extract_query: SQL query to extract data from db. $CONDITIONS must be part of the where clause.
+            :boundary_query: Query to be used to get the range of row IDs to be extracted
+            :split_column: Column used as row ID to split data into ranges (mode 2)
+
+    .. note:: The following fields are template-supported: ``query``, ``script_location``, ``sub_command``, ``script``, ``files``,
+        ``archives``, ``program``, ``cmdline``, ``sql``, ``where_clause``, ``extract_query``, ``boundary_query``, ``macros``, ``tags``,
+        ``name``, ``parameters``. You can also use ``.txt`` files for template-driven use cases.
     """
 
-    template_fields = ('query', 'script_location', 'sub_command', 'script', 'files', 'archives', 'program', 'cmdline', 'sql', 'where_clause', 'extract_query', 'boundary_query', 'macros', 'tags', 'name')
+    template_fields = ('query', 'script_location', 'sub_command', 'script', 'files', 'archives', 'program', 'cmdline',
+                       'sql', 'where_clause', 'extract_query', 'boundary_query', 'macros', 'tags', 'name', 'parameters')
     template_ext = ('.txt',)
     ui_color = '#3064A1'
     ui_fgcolor = '#fff'
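
For readers skimming the docstring above, here is a minimal usage sketch of how these kwargs are meant to be combined in a DAG. It is not part of this commit: the connection id "qubole_default", the cluster label "default", and the sample query/script values are illustrative assumptions, while the command_type-specific kwargs follow the documentation added in this change.

# Usage sketch (illustrative only, not part of this commit).
# Assumes a Qubole connection named "qubole_default" holding a QDS auth_token
# and an existing cluster labelled "default".
from datetime import datetime

from airflow import DAG
from airflow.contrib.operators.qubole_operator import QuboleOperator

dag = DAG('qubole_example',
          start_date=datetime(2016, 6, 1),
          schedule_interval=None)

# hivecmd: run an inline Hive query. "query" is a templated field, so Jinja
# macros such as {{ ds }} are rendered before the command is submitted to QDS.
show_tables = QuboleOperator(
    task_id='hive_show_tables',
    command_type='hivecmd',
    query='SHOW TABLES',
    cluster_label='default',
    tags=['example'],
    qubole_conn_id='qubole_default',
    dag=dag)

# shellcmd: run an inline shell command on the same cluster.
echo_date = QuboleOperator(
    task_id='shell_echo_date',
    command_type='shellcmd',
    script='echo "run date: {{ ds }}"',
    cluster_label='default',
    qubole_conn_id='qubole_default',
    dag=dag)

show_tables.set_downstream(echo_date)

Because ".txt" is registered in template_ext, pointing a templated field such as query at a value ending in ".txt" would make Airflow treat it as a template file and render its contents, whereas the inline strings above are rendered directly.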

http://git-wip-us.apache.org/repos/asf/incubator-airflow/blob/dce08f68/docs/code.rst
----------------------------------------------------------------------
diff --git a/docs/code.rst b/docs/code.rst
index 4c046ef..8a48b81 100644
--- a/docs/code.rst
+++ b/docs/code.rst
@@ -97,6 +97,7 @@ Community-contributed Operators
         VerticaOperator,
         VerticaToHiveTransfer
 
+.. autoclass:: airflow.contrib.operators.QuboleOperator
 .. autoclass:: airflow.contrib.operators.hipchat_operator.HipChatAPIOperator
 .. autoclass:: airflow.contrib.operators.hipchat_operator.HipChatAPISendRoomNotificationOperator