You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by po...@apache.org on 2021/04/08 17:30:27 UTC

[airflow] branch v2-0-test updated: Better compatibility/diagnostics for arbitrary UID in docker image (#15162)

This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch v2-0-test
in repository https://gitbox.apache.org/repos/asf/airflow.git


The following commit(s) were added to refs/heads/v2-0-test by this push:
     new f074bcf  Better compatibility/diagnostics for arbitrary UID in docker image (#15162)
f074bcf is described below

commit f074bcf1a2dd16e457018d3daaf78c1eff81d6ec
Author: Jarek Potiuk <ja...@potiuk.com>
AuthorDate: Thu Apr 8 19:28:36 2021 +0200

    Better compatibility/diagnostics for arbitrary UID in docker image (#15162)
    
    The PROD image of airflow is OpenShift compatible and it can be
    run with either 'airflow' user (UID=50000) or with any other
    user with (GID=0).
    
    This change adds umask 0002 to make sure that whenever the image
    is extended and new directories get created, the directories are
    group-writeable for GID=0. This is added in the default
    entrypoint.
    
    The entrypoint will fail if the image is run with an arbitrary
    user (other than the 'airflow' user) and with GID != 0.
    
    Fixes: #15107
    (cherry picked from commit ce91872eccceb8fb6277012a909ad6b529a071d2)
---
 Dockerfile                                         |  2 +-
 chart/values.yaml                                  |  2 +-
 docs/docker-stack/build-arg-ref.rst                |  8 ++--
 docs/docker-stack/build.rst                        | 35 ++++++++++++++--
 .../extending/writable-directory/Dockerfile        | 21 ++++++++++
 docs/docker-stack/entrypoint.rst                   | 46 +++++++++++++++++++---
 scripts/in_container/prod/entrypoint_prod.sh       | 42 +++++++++++++++++++-
 7 files changed, 141 insertions(+), 15 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 3928057..2a05964 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -487,7 +487,7 @@ WORKDIR ${AIRFLOW_HOME}
 
 EXPOSE 8080
 
-RUN usermod -g 0 airflow
+RUN usermod -g 0 airflow -G ${AIRFLOW_GID}
 
 USER ${AIRFLOW_UID}
 
diff --git a/chart/values.yaml b/chart/values.yaml
index cbced4f..8516980 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -21,7 +21,7 @@
 
 # User and group of airflow user
 uid: 50000
-gid: 50000
+gid: 0
 
 # Airflow home directory
 # Used for mount paths
diff --git a/docs/docker-stack/build-arg-ref.rst b/docs/docker-stack/build-arg-ref.rst
index 2ec04c8..f459cb7 100644
--- a/docs/docker-stack/build-arg-ref.rst
+++ b/docs/docker-stack/build-arg-ref.rst
@@ -51,10 +51,10 @@ Those are the most common arguments that you use when you want to build a custom
 +------------------------------------------+------------------------------------------+------------------------------------------+
 | ``AIRFLOW_UID``                          | ``50000``                                | Airflow user UID.                        |
 +------------------------------------------+------------------------------------------+------------------------------------------+
-| ``AIRFLOW_GID``                          | ``50000``                                | Airflow group GID. Note that most files  |
-|                                          |                                          | created on behalf of airflow user belong |
-|                                          |                                          | to the ``root`` group (0) to keep        |
-|                                          |                                          | OpenShift Guidelines compatibility.      |
+| ``AIRFLOW_GID``                          | ``50000``                                | Airflow group GID. Note that writable    |
+|                                          |                                          | files/dirs, created on behalf of airflow |
+|                                          |                                          | user are set to the ``root`` group (0)   |
+|                                          |                                          | to allow arbitrary UID to run the image. |
 +------------------------------------------+------------------------------------------+------------------------------------------+
 | ``AIRFLOW_CONSTRAINTS_REFERENCE``        |                                          | Reference (branch or tag) from GitHub    |
 |                                          |                                          | where constraints file is taken from     |
diff --git a/docs/docker-stack/build.rst b/docs/docker-stack/build.rst
index a07a837..5fa0a59 100644
--- a/docs/docker-stack/build.rst
+++ b/docs/docker-stack/build.rst
@@ -89,6 +89,11 @@ You should be aware, about a few things:
   PIP packages are installed to ``~/.local`` folder as if the ``--user`` flag was specified when running PIP.
   Note also that using ``--no-cache-dir`` is a good idea that can help to make your image smaller.
 
+.. note::
+  Only as of ``2.0.1`` image the ``--user`` flag is turned on by default by setting ``PIP_USER`` environment
+  variable to ``true``. This can be disabled by un-setting the variable or by setting it to ``false``. In the
+  2.0.0 image you had to add the ``--user`` flag as ``pip install --user`` command.
+
 * If your apt, or PyPI dependencies require some of the ``build-essential`` or other packages that need
   to compile your python dependencies, then your best choice is to follow the "Customize the image" route,
   because you can build a highly-optimized (for size) image this way. However it requires to checkout sources
@@ -103,10 +108,22 @@ You should be aware, about a few things:
   a command ``docker build . --tag my-image:my-tag`` (where ``my-image`` is the name you want to name it
   and ``my-tag`` is the tag you want to tag the image with.
 
+* If extending the image requires creating writable directories, you MUST remember to add
+  a ``umask 0002`` step to your RUN command. This is necessary in order to accommodate our approach for
+  running the image with an arbitrary user. Such a user will always run with ``GID=0`` -
+  the entrypoint will prevent non-root GIDs. You can read more about it in the
+  :ref:`arbitrary docker user <arbitrary-docker-user>` documentation for the entrypoint. The
+  ``umask 0002`` is set as default when you enter the image, so any directories you create by default
+  at runtime will have ``GID=0`` and will be group-writable.
+
 .. note::
-  As of 2.0.1 image the ``--user`` flag is turned on by default by setting ``PIP_USER`` environment variable
-  to ``true``. This can be disabled by un-setting the variable or by setting it to ``false``. In the
-  2.0.0 image you had to add the ``--user`` flag as ``pip install --user`` command.
+  Only as of ``2.0.2`` the default group of ``airflow`` user is ``root``. Previously it was ``airflow``,
+  so if you are building your images based on an earlier image, you need to manually change the default
+  group for airflow user:
+
+.. code-block:: docker
+
+    RUN usermod -g 0 airflow
 
 Examples of image extending
 ---------------------------
@@ -131,6 +148,18 @@ The following example adds ``lxml`` python package from PyPI to the image.
     :start-after: [START Dockerfile]
     :end-before: [END Dockerfile]
 
+A ``umask`` requiring example
+.............................
+
+The following example adds a new directory that is supposed to be writable for any arbitrary user
+running the container.
+
+.. exampleinclude:: docker-examples/extending/writable-directory/Dockerfile
+    :language: Dockerfile
+    :start-after: [START Dockerfile]
+    :end-before: [END Dockerfile]
+
+
 A ``build-essential`` requiring package example
 ...............................................
 
diff --git a/docs/docker-stack/docker-examples/extending/writable-directory/Dockerfile b/docs/docker-stack/docker-examples/extending/writable-directory/Dockerfile
new file mode 100644
index 0000000..76c6535
--- /dev/null
+++ b/docs/docker-stack/docker-examples/extending/writable-directory/Dockerfile
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This is an example Dockerfile. It is not intended for PRODUCTION use
+# [START Dockerfile]
+FROM apache/airflow:2.0.1
+RUN umask 0002; \
+    mkdir -p ~/writeable-directory
+# [END Dockerfile]
diff --git a/docs/docker-stack/entrypoint.rst b/docs/docker-stack/entrypoint.rst
index 829b37e..cc89872 100644
--- a/docs/docker-stack/entrypoint.rst
+++ b/docs/docker-stack/entrypoint.rst
@@ -87,13 +87,49 @@ The image entrypoint works as follows:
   command to execute and result of this evaluation is used as ``AIRFLOW__CELERY__BROKER_URL``. The
   ``_CMD`` variable takes precedence over the ``AIRFLOW__CELERY__BROKER_URL`` variable.
 
-Creating system user
---------------------
+.. _arbitrary-docker-user:
+
+Allowing arbitrary user to run the container
+--------------------------------------------
+
+The Airflow image is OpenShift compatible, which means that you can start it with a random user ID and the
+group id ``0`` (``root``). If you want to run the image with a user different from ``airflow``, you MUST set
+the GID of the user to ``0``. If you try to use a different group, the entrypoint exits with an error.
+
+In order to accommodate a number of external libraries and projects, Airflow will automatically create
+such an arbitrary user in ``/etc/passwd`` and make its home directory point to ``/home/airflow``.
+Many 3rd-party libraries and packages require the home directory of the user to be present, because they
+need to write some cache information there, so such dynamic creation of a user is necessary.
+
+Such an arbitrary user has to be able to write to certain directories that need write access, and since
+it is not advised to allow write access to "other" for security reasons, the OpenShift
+guidelines introduced the concept of making all such folders have the ``0`` (``root``) group id (GID).
+We are following that concept: all the directories that need write access in the Airflow
+production image have GID set to 0, and they are writable for the group, so any user that
+belongs to the ``root`` group can write to them.
+
+The GID=0 is set as default for the ``airflow`` user, so any directories it creates have GID set to 0
+by default. The entrypoint sets ``umask`` to be ``0002`` - this means that any directories created by
+the user have also "group write" access for group ``0`` - they will be writable by other users with
+``root`` group. Also whenever any "arbitrary" user creates a folder (for example in a mounted volume), that
+folder will have a "group write" access and ``GID=0``, so that execution with another, arbitrary user
+will still continue to work, even if such directory is mounted by another arbitrary user later.
+
+The ``umask`` setting however only works for runtime of the container - it is not used during building of
+the image. If you would like to extend the image and add your own packages, you should remember to add
+``umask 0002`` in front of your docker command - this way the directories created by any installation
+that need group access will also be writable for the group. This can be done for example this way:
+
+  .. code-block:: docker
+
+      RUN umask 0002; \
+          do_something; \
+          do_otherthing;
+
 
-Airflow image is Open-Shift compatible, which means that you can start it with random user ID and group id 0.
-Airflow will automatically create such a user and make it's home directory point to ``/home/airflow``.
 You can read more about it in the "Support arbitrary user ids" chapter in the
-`Openshift best practices <https://docs.openshift.com/container-platform/4.1/openshift_images/create-images.html#images-create-guide-openshift_create-images>`_.
+`Openshift best practices <https://docs.openshift.com/container-platform/4.7/openshift_images/create-images.html#images-create-guide-openshift_create-images>`_.
+
 
 Waits for Airflow DB connection
 -------------------------------
diff --git a/scripts/in_container/prod/entrypoint_prod.sh b/scripts/in_container/prod/entrypoint_prod.sh
index 12d18e8..4ca8a75 100755
--- a/scripts/in_container/prod/entrypoint_prod.sh
+++ b/scripts/in_container/prod/entrypoint_prod.sh
@@ -15,7 +15,6 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
 # Might be empty
 AIRFLOW_COMMAND="${1}"
 
@@ -244,6 +243,47 @@ function exec_to_bash_or_python_command_if_specified() {
     fi
 }
 
+function check_uid_gid() {
+    if [[ $(id -g) == "0" ]]; then
+        return
+    fi
+    if [[ $(id -u) == "50000" ]]; then
+        >&2 echo
+        >&2 echo "WARNING! You should run the image with GID (Group ID) set to 0"
+        >&2 echo "         even if you use 'airflow' user (UID=50000)"
+        >&2 echo
+        >&2 echo " You started the image with UID=$(id -u) and GID=$(id -g)"
+        >&2 echo
+        >&2 echo " This is to make sure you can run the image with an arbitrary UID in the future."
+        >&2 echo
+        >&2 echo " See more about it in the Airflow's docker image documentation"
+        >&2 echo "     http://airflow.apache.org/docs/docker-stack/entrypoint"
+        >&2 echo
+        # Warning only: UID 50000 ('airflow') with a non-zero GID is still allowed, so return normally.
+        return
+    else
+        >&2 echo
+        >&2 echo "ERROR! You should run the image with GID=0"
+        >&2 echo
+        >&2 echo " You started the image with UID=$(id -u) and GID=$(id -g)"
+        >&2 echo
+        >&2 echo "The image should always be run with GID (Group ID) set to 0 regardless of the UID used."
+        >&2 echo " This is to make sure you can run the image with an arbitrary UID."
+        >&2 echo
+        >&2 echo " See more about it in the Airflow's docker image documentation"
+        >&2 echo "     http://airflow.apache.org/docs/docker-stack/entrypoint"
+        # Any other UID combined with a non-zero GID is rejected: stop the script with exit status 1.
+        exit 1
+    fi
+}
+
+check_uid_gid
+
+# Set umask to 0002 to make all the directories created by the current user group-writeable
+# This allows the same directories to be writeable for any arbitrary user the image will be
+# run with, when the directory is created on a mounted volume and when that volume is later
+# reused with a different UID (but with GID=0)
+umask 0002
 
 CONNECTION_CHECK_MAX_COUNT=${CONNECTION_CHECK_MAX_COUNT:=20}
 readonly CONNECTION_CHECK_MAX_COUNT