You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by ka...@apache.org on 2021/03/07 21:34:20 UTC
[airflow] branch master updated: Created initial guide for HDFS
operators (#11212)
This is an automated email from the ASF dual-hosted git repository.
kaxilnaik pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/master by this push:
new 71c5979 Created initial guide for HDFS operators (#11212)
71c5979 is described below
commit 71c5979a175364f1fb377874f67705c52c9afc38
Author: Aditya Trivedi <ia...@gmail.com>
AuthorDate: Mon Mar 8 03:04:08 2021 +0530
Created initial guide for HDFS operators (#11212)
closes https://github.com/apache/airflow/issues/8197
Co-authored-by: Kaxil Naik <ka...@gmail.com>
---
airflow/providers/apache/hdfs/provider.yaml | 2 +
airflow/providers/apache/hdfs/sensors/hdfs.py | 24 ++++++-
.../connections.rst | 44 ++++++++++++
.../apache-airflow-providers-apache-hdfs/index.rst | 7 ++
.../operators.rst | 79 ++++++++++++++++++++++
5 files changed, 153 insertions(+), 3 deletions(-)
diff --git a/airflow/providers/apache/hdfs/provider.yaml b/airflow/providers/apache/hdfs/provider.yaml
index 5983009..afba361 100644
--- a/airflow/providers/apache/hdfs/provider.yaml
+++ b/airflow/providers/apache/hdfs/provider.yaml
@@ -29,6 +29,8 @@ versions:
integrations:
- integration-name: Hadoop Distributed File System (HDFS)
external-doc-url: https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html
+ how-to-guide:
+ - /docs/apache-airflow-providers-apache-hdfs/operators.rst
logo: /integration-logos/apache/hadoop.png
tags: [apache]
- integration-name: WebHDFS
diff --git a/airflow/providers/apache/hdfs/sensors/hdfs.py b/airflow/providers/apache/hdfs/sensors/hdfs.py
index 867c193..518748a 100644
--- a/airflow/providers/apache/hdfs/sensors/hdfs.py
+++ b/airflow/providers/apache/hdfs/sensors/hdfs.py
@@ -29,7 +29,13 @@ log = logging.getLogger(__name__)
class HdfsSensor(BaseSensorOperator):
- """Waits for a file or folder to land in HDFS"""
+ """
+ Waits for a file or folder to land in HDFS
+
+ .. seealso::
+ For more information on how to use this operator, take a look at the guide:
+ :ref:`howto/operator:HdfsSensor`
+ """
template_fields = ('filepath',)
ui_color = settings.WEB_COLORS['LIGHTBLUE']
@@ -122,7 +128,13 @@ class HdfsSensor(BaseSensorOperator):
class HdfsRegexSensor(HdfsSensor):
- """Waits for matching files by matching on regex"""
+ """
+ Waits for matching files by matching on regex
+
+ .. seealso::
+ For more information on how to use this operator, take a look at the guide:
+ :ref:`howto/operator:HdfsRegexSensor`
+ """
def __init__(self, regex: Pattern[str], *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
@@ -149,7 +161,13 @@ class HdfsRegexSensor(HdfsSensor):
class HdfsFolderSensor(HdfsSensor):
- """Waits for a non-empty directory"""
+ """
+ Waits for a non-empty directory
+
+ .. seealso::
+ For more information on how to use this operator, take a look at the guide:
+ :ref:`howto/operator:HdfsFolderSensor`
+ """
def __init__(self, be_empty: bool = False, *args: Any, **kwargs: Any):
super().__init__(*args, **kwargs)
diff --git a/docs/apache-airflow-providers-apache-hdfs/connections.rst b/docs/apache-airflow-providers-apache-hdfs/connections.rst
new file mode 100644
index 0000000..d0ad36d
--- /dev/null
+++ b/docs/apache-airflow-providers-apache-hdfs/connections.rst
@@ -0,0 +1,44 @@
+ .. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ .. http://www.apache.org/licenses/LICENSE-2.0
+
+ .. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+Apache HDFS Connection
+======================
+
+The Apache HDFS connection type enables connection to Apache HDFS.
+
+Default Connection IDs
+----------------------
+
+The HDFS Hook takes its connection ID from the ``hdfs_conn_id`` parameter, which defaults
+to ``hdfs_default``.
+The Web HDFS Hook takes its connection ID from the ``webhdfs_conn_id`` parameter, which
+defaults to ``webhdfs_default``.
+
+Configuring the Connection
+--------------------------
+Host
+ The host to connect to. It can be ``local``, ``yarn`` or a URL.
+
+Port
+ Specify the port when the host is a URL.
+
+Extra (optional, connection parameters)
+ Specify the extra parameters (as a JSON dictionary) that can be used in the HDFS connection. Of the
+ standard Python parameters, the following are supported:
+
+ * ``proxy_user`` - Effective user for HDFS operations.
+ * ``autoconfig`` - Boolean, defaults to ``False``. Use snakebite's automatically configured client. This HDFSHook implementation requires snakebite.
diff --git a/docs/apache-airflow-providers-apache-hdfs/index.rst b/docs/apache-airflow-providers-apache-hdfs/index.rst
index d24dfaa..02e86d7 100644
--- a/docs/apache-airflow-providers-apache-hdfs/index.rst
+++ b/docs/apache-airflow-providers-apache-hdfs/index.rst
@@ -24,6 +24,13 @@ Content
.. toctree::
:maxdepth: 1
+ :caption: Guides
+
+ Connection types <connections>
+ Operators <operators>
+
+.. toctree::
+ :maxdepth: 1
:caption: References
Python API <_api/airflow/providers/apache/hdfs/index>
diff --git a/docs/apache-airflow-providers-apache-hdfs/operators.rst b/docs/apache-airflow-providers-apache-hdfs/operators.rst
new file mode 100644
index 0000000..a73b99a
--- /dev/null
+++ b/docs/apache-airflow-providers-apache-hdfs/operators.rst
@@ -0,0 +1,79 @@
+ .. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ .. http://www.apache.org/licenses/LICENSE-2.0
+
+ .. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+
+
+Apache Hadoop HDFS Operators
+============================
+
+
+`Apache Hadoop HDFS <https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html>`__ is a distributed file system
+designed to run on commodity hardware. It has many similarities with existing distributed file systems.
+However, the differences from other distributed file systems are significant.
+HDFS is highly fault-tolerant and is designed to be deployed on low-cost hardware.
+HDFS provides high throughput access to application data and is suitable for applications that have
+large data sets. HDFS relaxes a few POSIX requirements to enable streaming access to file
+system data. HDFS is now an Apache Hadoop subproject.
+
+.. contents::
+ :depth: 1
+ :local:
+
+Prerequisite
+------------
+
+To use these operators, you must configure an :doc:`HDFS Connection <connections>`.
+
+.. _howto/operator:HdfsFolderSensor:
+
+HdfsFolderSensor
+----------------
+Waits for a non-empty directory
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :class:`~airflow.providers.apache.hdfs.sensors.hdfs.HdfsFolderSensor` operator is used to
+check for a non-empty directory in HDFS.
+
+Use the ``filepath`` parameter to specify the directory to poke.
+
+.. _howto/operator:HdfsRegexSensor:
+
+HdfsRegexSensor
+---------------
+Waits for matching files by matching on regex
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :class:`~airflow.providers.apache.hdfs.sensors.hdfs.HdfsRegexSensor` operator is used to check for
+matching files by matching on regex in HDFS.
+
+Use the ``filepath`` parameter to specify the HDFS path to check, and the ``regex`` parameter to supply the
+pattern that files must match.
+
+
+.. _howto/operator:HdfsSensor:
+
+Waits for a file or folder to land in HDFS
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :class:`~airflow.providers.apache.hdfs.sensors.hdfs.HdfsSensor` operator is used to check for a file or folder to land in HDFS.
+
+Use the ``filepath`` parameter to poke until the provided file or folder is found.
+
+Reference
+^^^^^^^^^
+
+For further information, look at `HDFS Architecture Guide <https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html>`_.