You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by ka...@apache.org on 2021/03/07 21:34:20 UTC

[airflow] branch master updated: Created initial guide for HDFS operators (#11212)

This is an automated email from the ASF dual-hosted git repository.

kaxilnaik pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/airflow.git


The following commit(s) were added to refs/heads/master by this push:
     new 71c5979  Created initial guide for HDFS operators  (#11212)
71c5979 is described below

commit 71c5979a175364f1fb377874f67705c52c9afc38
Author: Aditya Trivedi <ia...@gmail.com>
AuthorDate: Mon Mar 8 03:04:08 2021 +0530

    Created initial guide for HDFS operators  (#11212)
    
    closes https://github.com/apache/airflow/issues/8197
    
    Co-authored-by: Kaxil Naik <ka...@gmail.com>
---
 airflow/providers/apache/hdfs/provider.yaml        |  2 +
 airflow/providers/apache/hdfs/sensors/hdfs.py      | 24 ++++++-
 .../connections.rst                                | 44 ++++++++++++
 .../apache-airflow-providers-apache-hdfs/index.rst |  7 ++
 .../operators.rst                                  | 79 ++++++++++++++++++++++
 5 files changed, 153 insertions(+), 3 deletions(-)

diff --git a/airflow/providers/apache/hdfs/provider.yaml b/airflow/providers/apache/hdfs/provider.yaml
index 5983009..afba361 100644
--- a/airflow/providers/apache/hdfs/provider.yaml
+++ b/airflow/providers/apache/hdfs/provider.yaml
@@ -29,6 +29,8 @@ versions:
 integrations:
   - integration-name: Hadoop Distributed File System (HDFS)
     external-doc-url: https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html
+    how-to-guide:
+      - /docs/apache-airflow-providers-apache-hdfs/operators.rst
     logo: /integration-logos/apache/hadoop.png
     tags: [apache]
   - integration-name: WebHDFS
diff --git a/airflow/providers/apache/hdfs/sensors/hdfs.py b/airflow/providers/apache/hdfs/sensors/hdfs.py
index 867c193..518748a 100644
--- a/airflow/providers/apache/hdfs/sensors/hdfs.py
+++ b/airflow/providers/apache/hdfs/sensors/hdfs.py
@@ -29,7 +29,13 @@ log = logging.getLogger(__name__)
 
 
 class HdfsSensor(BaseSensorOperator):
-    """Waits for a file or folder to land in HDFS"""
+    """
+    Waits for a file or folder to land in HDFS
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:HdfsSensor`
+    """
 
     template_fields = ('filepath',)
     ui_color = settings.WEB_COLORS['LIGHTBLUE']
@@ -122,7 +128,13 @@ class HdfsSensor(BaseSensorOperator):
 
 
 class HdfsRegexSensor(HdfsSensor):
-    """Waits for matching files by matching on regex"""
+    """
+    Waits for matching files by matching on regex
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:HdfsRegexSensor`
+    """
 
     def __init__(self, regex: Pattern[str], *args: Any, **kwargs: Any) -> None:
         super().__init__(*args, **kwargs)
@@ -149,7 +161,13 @@ class HdfsRegexSensor(HdfsSensor):
 
 
 class HdfsFolderSensor(HdfsSensor):
-    """Waits for a non-empty directory"""
+    """
+    Waits for a non-empty directory
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:HdfsFolderSensor`
+    """
 
     def __init__(self, be_empty: bool = False, *args: Any, **kwargs: Any):
         super().__init__(*args, **kwargs)
diff --git a/docs/apache-airflow-providers-apache-hdfs/connections.rst b/docs/apache-airflow-providers-apache-hdfs/connections.rst
new file mode 100644
index 0000000..d0ad36d
--- /dev/null
+++ b/docs/apache-airflow-providers-apache-hdfs/connections.rst
@@ -0,0 +1,44 @@
+ .. Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+ ..   http://www.apache.org/licenses/LICENSE-2.0
+
+ .. Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+Apache HDFS Connection
+======================
+
+The Apache HDFS connection type enables connection to Apache HDFS.
+
+Default Connection IDs
+----------------------
+
+The HDFS Hook uses the parameter ``hdfs_conn_id`` to reference the Connection ID;
+its default value is ``hdfs_default``.
+The WebHDFS Hook uses the parameter ``webhdfs_conn_id`` to reference the Connection ID;
+its default value is ``webhdfs_default``.
+
+Configuring the Connection
+--------------------------
+Host
+    The host to connect to; it can be ``local``, ``yarn`` or a URL.
+
+Port
+    Specify the port when the host is a URL.
+
+Extra (optional, connection parameters)
+    Specify the extra parameters (as a JSON dictionary) that can be used in the HDFS connection. The following
+    parameters, in addition to the standard connection parameters, are supported:
+
+    * ``proxy_user`` - Effective user for HDFS operations.
+    * ``autoconfig`` - Boolean, defaults to ``False``. Use snakebite's automatically configured client. This HDFSHook implementation requires snakebite.
diff --git a/docs/apache-airflow-providers-apache-hdfs/index.rst b/docs/apache-airflow-providers-apache-hdfs/index.rst
index d24dfaa..02e86d7 100644
--- a/docs/apache-airflow-providers-apache-hdfs/index.rst
+++ b/docs/apache-airflow-providers-apache-hdfs/index.rst
@@ -24,6 +24,13 @@ Content
 
 .. toctree::
     :maxdepth: 1
+    :caption: Guides
+
+    Connection types <connections>
+    Operators <operators>
+
+.. toctree::
+    :maxdepth: 1
     :caption: References
 
     Python API <_api/airflow/providers/apache/hdfs/index>
diff --git a/docs/apache-airflow-providers-apache-hdfs/operators.rst b/docs/apache-airflow-providers-apache-hdfs/operators.rst
new file mode 100644
index 0000000..a73b99a
--- /dev/null
+++ b/docs/apache-airflow-providers-apache-hdfs/operators.rst
@@ -0,0 +1,79 @@
+ .. Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+ ..   http://www.apache.org/licenses/LICENSE-2.0
+
+ .. Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+
+
+Apache Hadoop HDFS Operators
+============================
+
+
+`Apache Hadoop HDFS <https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html>`__ is a distributed file system
+designed to run on commodity hardware. It has many similarities with existing distributed file systems.
+However, the differences from other distributed file systems are significant.
+HDFS is highly fault-tolerant and is designed to be deployed on low-cost hardware.
+HDFS provides high throughput access to application data and is suitable for applications that have
+large data sets. HDFS relaxes a few POSIX requirements to enable streaming access to file
+system data. HDFS is now an Apache Hadoop subproject.
+
+.. contents::
+  :depth: 1
+  :local:
+
+Prerequisite
+------------
+
+To use these operators, you must configure an :doc:`HDFS Connection <connections>`.
+
+.. _howto/operator:HdfsFolderSensor:
+
+HdfsFolderSensor
+----------------
+Waits for a non-empty directory
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :class:`~airflow.providers.apache.hdfs.sensors.hdfs.HdfsFolderSensor` operator is used to
+check for a non-empty directory in HDFS.
+
+Use the ``filepath`` parameter to poke until the provided directory is found.
+
+.. _howto/operator:HdfsRegexSensor:
+
+HdfsRegexSensor
+---------------
+Waits for matching files by matching on regex
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :class:`~airflow.providers.apache.hdfs.sensors.hdfs.HdfsRegexSensor` operator is used to check for
+matching files by matching on regex in HDFS.
+
+Use the ``filepath`` parameter to specify the directory to poke and the ``regex`` parameter
+to provide the pattern that file names must match.
+
+
+.. _howto/operator:HdfsSensor:
+
+Waits for a file or folder to land in HDFS
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :class:`~airflow.providers.apache.hdfs.sensors.hdfs.HdfsSensor` operator is used to check for a file or folder to land in HDFS.
+
+Use the ``filepath`` parameter to poke until the provided file is found.
+
+Reference
+^^^^^^^^^
+
+For further information, look at the `HDFS Architecture Guide <https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html>`_.