You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by po...@apache.org on 2020/11/14 15:07:54 UTC

[airflow] 05/05: Simplifies check whether the CI image should be rebuilt (#12181)

This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch v1-10-test
in repository https://gitbox.apache.org/repos/asf/airflow.git

commit 430c047a0605125e463fdd3b45e7a848e2d26c13
Author: Jarek Potiuk <ja...@polidea.com>
AuthorDate: Fri Nov 13 22:21:39 2020 +0100

    Simplifies check whether the CI image should be rebuilt (#12181)
    
    Rather than counting changed layers in the image (which was
    enigmatic, difficult and prone to some magic number) we rely now
    on random file generated while building the image.
    
    We are using the docker image caching mechanism here. The random
    file will be regenerated only when the previous layer (which is
    about installling Airflow dependencies for the first time) gets
    rebuild. And for us this is the indication, that the building
    the image will take quite some time. This layer should be
    relatively static - even if setup.py changes the CI image is
    designed in the way that the first time installation of Airflow
    dependencies is not invalidated.
    
    This should lead to faster and less frequent rebuild for people
    using Breeze and static checks.
    
    (cherry picked from commit 167b9b9889ac5481b21cb35c6cdef5869b8ab713)
---
 Dockerfile.ci                           |   9 +-
 breeze                                  |   6 +-
 manifests/.gitignore                    |   2 +-
 scripts/ci/libraries/_build_images.sh   | 163 ++++++++++++++++----------------
 scripts/ci/libraries/_initialization.sh |  19 +++-
 5 files changed, 106 insertions(+), 93 deletions(-)

diff --git a/Dockerfile.ci b/Dockerfile.ci
index 7699706..f06087b 100644
--- a/Dockerfile.ci
+++ b/Dockerfile.ci
@@ -132,7 +132,9 @@ ARG RUNTIME_APT_DEPS="\
       sqlite3 \
       tmux \
       unzip \
-      vim"
+      vim \
+      xxd"
+ENV RUNTIME_APT_DEP=${RUNTIME_APT_DEPS}
 
 ARG ADDITIONAL_RUNTIME_APT_DEPS=""
 ENV ADDITIONAL_RUNTIME_APT_DEPS=${ADDITIONAL_RUNTIME_APT_DEPS}
@@ -275,6 +277,11 @@ RUN if [[ ${AIRFLOW_PRE_CACHED_PIP_PACKAGES} == "true" ]]; then \
     fi
 
 
+# Generate random hex dump file so that we can determine whether it's faster to rebuild the image
+# using current cache (when our dump is the same as the remote onb) or better to pull
+# the new image (when it is different)
+RUN head -c 30 /dev/urandom | xxd -ps >/build-cache-hash
+
 # Link dumb-init for backwards compatibility (so that older images also work)
 RUN ln -sf /usr/bin/dumb-init /usr/local/bin/dumb-init
 
diff --git a/breeze b/breeze
index 55412d0..175a4ab 100755
--- a/breeze
+++ b/breeze
@@ -2873,11 +2873,9 @@ function breeze::run_build_command() {
             build_images::prepare_prod_build
             build_images::build_prod_images
         else
+
             build_images::prepare_ci_build
-            md5sum::calculate_md5sum_for_all_files
-            build_images::build_ci_image
-            md5sum::update_all_md5
-            build_images::build_ci_image_manifest
+            build_images::rebuild_ci_image_if_needed
         fi
         ;;
     cleanup_image | run_exec)
diff --git a/manifests/.gitignore b/manifests/.gitignore
index a6c57f5..72e8ffc 100644
--- a/manifests/.gitignore
+++ b/manifests/.gitignore
@@ -1 +1 @@
-*.json
+*
diff --git a/scripts/ci/libraries/_build_images.sh b/scripts/ci/libraries/_build_images.sh
index a2577cb..52b8a84 100644
--- a/scripts/ci/libraries/_build_images.sh
+++ b/scripts/ci/libraries/_build_images.sh
@@ -136,10 +136,18 @@ function build_images::confirm_image_rebuild() {
             ;;
         esac
     elif [[ -t 0 ]]; then
+        echo
+        echo
+        echo "Make sure that you rebased to latest master before rebuilding!"
+        echo
         # Check if this script is run interactively with stdin open and terminal attached
         "${AIRFLOW_SOURCES}/confirm" "${ACTION} image ${THE_IMAGE_TYPE}-python${PYTHON_MAJOR_MINOR_VERSION}"
         RES=$?
     elif [[ ${DETECTED_TERMINAL:=$(tty)} != "not a tty" ]]; then
+        echo > "${DETECTED_TERMINAL}"
+        echo > "${DETECTED_TERMINAL}"
+        echo "Make sure that you rebased to latest master before rebuilding!" > "${DETECTED_TERMINAL}"
+        echo > "${DETECTED_TERMINAL}"
         # Make sure to use output of tty rather than stdin/stdout when available - this way confirm
         # will works also in case of pre-commits (git does not pass stdin/stdout to pre-commit hooks)
         # shellcheck disable=SC2094
@@ -151,6 +159,10 @@ function build_images::confirm_image_rebuild() {
         export DETECTED_TERMINAL=/dev/tty
         # Make sure to use /dev/tty first rather than stdin/stdout when available - this way confirm
         # will works also in case of pre-commits (git does not pass stdin/stdout to pre-commit hooks)
+        echo > "${DETECTED_TERMINAL}"
+        echo > "${DETECTED_TERMINAL}"
+        echo "Make sure that you rebased to latest master before rebuilding!" > "${DETECTED_TERMINAL}"
+        echo > "${DETECTED_TERMINAL}"
         # shellcheck disable=SC2094
         "${AIRFLOW_SOURCES}/confirm" "${ACTION} image ${THE_IMAGE_TYPE}-python${PYTHON_MAJOR_MINOR_VERSION}" \
             <"${DETECTED_TERMINAL}" >"${DETECTED_TERMINAL}"
@@ -193,113 +205,95 @@ function build_images::confirm_image_rebuild() {
 # We cannot use docker registry APIs as they are available only with authorisation
 # But this image can be pulled without authentication
 function build_images::build_ci_image_manifest() {
-    docker inspect "${AIRFLOW_CI_IMAGE}" >"manifests/${AIRFLOW_CI_BASE_TAG}.json"
     docker build \
-        --build-arg AIRFLOW_CI_BASE_TAG="${AIRFLOW_CI_BASE_TAG}" \
         --tag="${AIRFLOW_CI_LOCAL_MANIFEST_IMAGE}" \
         -f- . <<EOF
-ARG AIRFLOW_CI_BASE_TAG
 FROM scratch
 
-COPY "manifests/${AIRFLOW_CI_BASE_TAG}.json" .
+COPY "manifests/local-build-cache-hash" /build-cache-hash
 
 CMD ""
 EOF
 }
 
 #
-# Retrieves information about layers in the local IMAGE
-# it stores list of SHAs of image layers in the file pointed at by TMP_MANIFEST_LOCAL_SHA
+# Retrieves information about build cache hash random file from the local image
 #
-function build_images::get_local_image_info() {
-    TMP_MANIFEST_LOCAL_JSON=$(mktemp)
-    TMP_MANIFEST_LOCAL_SHA=$(mktemp)
+function build_images::get_local_build_cache_hash() {
+
     set +e
     # Remove the container just in case
-    docker rm --force "local-airflow-manifest" 2>/dev/null >/dev/null
-    # Create manifest from the local manifest image
-    if ! docker create --name "local-airflow-manifest" "${AIRFLOW_CI_LOCAL_MANIFEST_IMAGE}" 2>/dev/null; then
-        echo
-        echo "Local manifest image not available"
-        echo
+    docker rm --force "local-airflow-ci-container" 2>/dev/null >/dev/null
+    if ! docker create --name "local-airflow-ci-container" "${AIRFLOW_CI_IMAGE}" 2>/dev/null; then
+        >&2 echo
+        >&2 echo "Local airflow CI image not available"
+        >&2 echo
         LOCAL_MANIFEST_IMAGE_UNAVAILABLE="true"
+        export LOCAL_MANIFEST_IMAGE_UNAVAILABLE
+        touch "${LOCAL_IMAGE_BUILD_CACHE_HASH_FILE}"
         return
     fi
-    # Create manifest from the local manifest image
-    docker cp "local-airflow-manifest:${AIRFLOW_CI_BASE_TAG}.json" "${TMP_MANIFEST_LOCAL_JSON}"
-    sed 's/ *//g' "${TMP_MANIFEST_LOCAL_JSON}" | grep '^"sha256:' >"${TMP_MANIFEST_LOCAL_SHA}"
-    docker rm --force "local-airflow-manifest" 2>/dev/null >/dev/null
+    docker cp "local-airflow-ci-container:/build-cache-hash" \
+        "${LOCAL_IMAGE_BUILD_CACHE_HASH_FILE}" 2> /dev/null \
+        || touch "${LOCAL_IMAGE_BUILD_CACHE_HASH_FILE}"
     set -e
+    echo
+    echo "Local build cache hash: '$(cat "${LOCAL_IMAGE_BUILD_CACHE_HASH_FILE}")'"
+    echo
 }
 
-#
-# Retrieves information about layers in the remote IMAGE
-# it stores list of SHAs of image layers in the file pointed at by TMP_MANIFEST_REMOTE_SHA
-# This cannot be done easily with existing APIs of Dockerhub because they require additional authentication
-# even for public images. Therefore instead we are downloading a specially prepared manifest image
-# which is built together with the main image. This special manifest image is prepared during
-# building of the main image and contains single JSON file being result of docker inspect on that image
-# This image is from scratch so it is very tiny
-function build_images::get_remote_image_info() {
+# Retrieves information about the build cache hash random file from the remote image.
+# We actually use manifest image for that, which is a really, really small image to pull!
+# The problem is that inspecting information about remote image cannot be done easily with existing APIs
+# of Dockerhub because they require additional authentication even for public images.
+# Therefore instead we are downloading a specially prepared manifest image
+# which is built together with the main image and pushed with it. This special manifest image is prepared
+# during building of the main image and contains single file which is randomly built during the docker
+# build in the right place in the image (right after installing all dependencies of Apache Airflow
+# for the first time). When this random file gets regenerated it means that either base image has
+# changed or some of the earlier layers was modified - which means that it is usually faster to pull
+# that image first and then rebuild it - because this will likely be faster
+function build_images::get_remote_image_build_cache_hash() {
     set +e
     # Pull remote manifest image
     if ! docker pull "${AIRFLOW_CI_REMOTE_MANIFEST_IMAGE}" 2>/dev/null >/dev/null; then
-        echo >&2
-        echo >&2 "Remote docker registry unreachable"
-        echo >&2
+        >&2 echo
+        >&2 echo "Remote docker registry unreachable"
+        >&2 echo
         REMOTE_DOCKER_REGISTRY_UNREACHABLE="true"
+        export REMOTE_DOCKER_REGISTRY_UNREACHABLE
+        touch "${REMOTE_IMAGE_BUILD_CACHE_HASH_FILE}"
         return
     fi
     set -e
-
-    # Docker needs the file passed to --cidfile to not exist, so we can't use mktemp
-    TMP_CONTAINER_DIR="$(mktemp -d)"
-    TMP_CONTAINER_ID="${TMP_CONTAINER_DIR}/remote-airflow-manifest-$$.container_id"
-    FILES_TO_CLEANUP_ON_EXIT+=("$TMP_CONTAINER_ID")
-
-    TMP_MANIFEST_REMOTE_JSON=$(mktemp)
-    TMP_MANIFEST_REMOTE_SHA=$(mktemp)
-    # Create container out of the manifest image without running it
-    docker create --cidfile "${TMP_CONTAINER_ID}" "${AIRFLOW_CI_REMOTE_MANIFEST_IMAGE}" 2>/dev/null >/dev/null
+    # Create container dump out of the manifest image without actually running it
+    docker create --cidfile "${REMOTE_IMAGE_CONTAINER_ID_FILE}" "${AIRFLOW_CI_REMOTE_MANIFEST_IMAGE}" \
+        2>/dev/null >/dev/null || true
     # Extract manifest and store it in local file
-    docker cp "$(cat "${TMP_CONTAINER_ID}"):${AIRFLOW_CI_BASE_TAG}.json" "${TMP_MANIFEST_REMOTE_JSON}"
-    # Filter everything except SHAs of image layers
-    sed 's/ *//g' "${TMP_MANIFEST_REMOTE_JSON}" | grep '^"sha256:' >"${TMP_MANIFEST_REMOTE_SHA}"
-    docker rm --force "$(cat "${TMP_CONTAINER_ID}")" 2>/dev/null >/dev/null
+    docker cp "$(cat "${REMOTE_IMAGE_CONTAINER_ID_FILE}"):/build-cache-hash" \
+        "${REMOTE_IMAGE_BUILD_CACHE_HASH_FILE}" 2> /dev/null \
+        || touch "${REMOTE_IMAGE_BUILD_CACHE_HASH_FILE}"
+    docker rm --force "$(cat "${REMOTE_IMAGE_CONTAINER_ID_FILE}")" 2>/dev/null || true
+    echo
+    echo "Remote build cache hash: '$(cat "${REMOTE_IMAGE_BUILD_CACHE_HASH_FILE}")'"
+    echo
 }
 
-# The Number determines the cut-off between local building time and pull + build time.
-# It is a bit experimental and it will have to be kept
-# updated as we keep on changing layers. The cut-off point is at the moment when we do first
-# pip install "https://github.com/apache/airflow/archive/${AIRFLOW_BRANCH}.tar...
-# you can get it via this command:
-# docker history --no-trunc  apache/airflow:master-python3.6-ci | \
-#      grep ^sha256 | grep -n "pip uninstall" | awk 'BEGIN {FS=":"} {print $1 }'
-#
-# This command returns the number of layer in docker history where pip uninstall is called. This is the
-# line that will take a lot of time to run and at this point it's worth to pull the image from repo
-# if there are at least NN changed layers in your docker file, you should pull the image.
-#
-# Note that this only matters if you have any of the important files changed since the last build
-# of your image such as Dockerfile.ci, setup.py etc.
-#
-MAGIC_CUT_OFF_NUMBER_OF_LAYERS=36
-
 # Compares layers from both remote and local image and set FORCE_PULL_IMAGES to true in case
 # More than the last NN layers are different.
-function build_images::compare_layers() {
-    NUM_DIFF=$(diff -y --suppress-common-lines "${TMP_MANIFEST_REMOTE_SHA}" "${TMP_MANIFEST_LOCAL_SHA}" |
-        wc -l || true)
-    rm -f "${TMP_MANIFEST_REMOTE_JSON}" "${TMP_MANIFEST_REMOTE_SHA}" "${TMP_MANIFEST_LOCAL_JSON}" "${TMP_MANIFEST_LOCAL_SHA}"
-    echo
-    echo "Number of layers different between the local and remote image: ${NUM_DIFF}"
-    echo
-    # This is where setup py is rebuilt - it will usually take a looooot of time to build it, so it is
-    # Better to pull here
-    if ((NUM_DIFF >= MAGIC_CUT_OFF_NUMBER_OF_LAYERS)); then
+function build_images::compare_local_and_remote_build_cache_hash() {
+    set +e
+    local remote_hash
+    remote_hash=$(cat "${REMOTE_IMAGE_BUILD_CACHE_HASH_FILE}")
+    local local_hash
+    local_hash=$(cat "${LOCAL_IMAGE_BUILD_CACHE_HASH_FILE}")
+
+    if [[ ${remote_hash} != "${local_hash}" ||
+        ${local_hash} == "" ]]; then
         echo
         echo
-        echo "WARNING! Your image and the dockerhub image differ significantly"
+        echo "Your image and the dockerhub have different or missing build cache hashes."
+        echo "Local hash: '${local_hash}'. Remote hash: '${remote_hash}'."
         echo
         echo "Forcing pulling the images. It will be faster than rebuilding usually."
         echo "You can avoid it by setting SKIP_CHECK_REMOTE_IMAGE to true"
@@ -307,9 +301,10 @@ function build_images::compare_layers() {
         export FORCE_PULL_IMAGES="true"
     else
         echo
-        echo "No need to pull the image. Local rebuild will be faster"
+        echo "No need to pull the image. Yours and remote cache hashes are the same!"
         echo
     fi
+    set -e
 }
 
 # Prints summary of the build parameters
@@ -335,7 +330,6 @@ function build_images::get_docker_image_names() {
     # CI image to build
     export AIRFLOW_CI_IMAGE="${DOCKERHUB_USER}/${DOCKERHUB_REPO}:${AIRFLOW_CI_BASE_TAG}"
 
-
     # Base production image tag - used to build kubernetes tag as well
     if [[ ${FORCE_AIRFLOW_PROD_BASE_TAG=} == "" ]]; then
         export AIRFLOW_PROD_BASE_TAG="${BRANCH_NAME}-python${PYTHON_MAJOR_MINOR_VERSION}"
@@ -396,6 +390,9 @@ function build_images::prepare_ci_build() {
 # In case rebuild is needed, it determines (by comparing layers in local and remote image)
 # Whether pull is needed before rebuild.
 function build_images::rebuild_ci_image_if_needed() {
+    verbosity::print_info
+    verbosity::print_info "Checking if pull or just build for ${THE_IMAGE_TYPE} is needed."
+    verbosity::print_info
     if [[ -f "${BUILT_CI_IMAGE_FLAG_FILE}" ]]; then
         verbosity::print_info
         verbosity::print_info "${THE_IMAGE_TYPE} image already built locally."
@@ -418,6 +415,7 @@ function build_images::rebuild_ci_image_if_needed() {
 
     local needs_docker_build="false"
     md5sum::check_if_docker_build_is_needed
+    build_images::get_local_build_cache_hash
     if [[ ${needs_docker_build} == "true" ]]; then
         if [[ ${SKIP_CHECK_REMOTE_IMAGE:=} != "true" && ${DOCKER_CACHE} == "pulled" ]]; then
             # Check if remote image is different enough to force pull
@@ -427,14 +425,12 @@ function build_images::rebuild_ci_image_if_needed() {
             echo
             echo "Checking if the remote image needs to be pulled"
             echo
-            build_images::get_remote_image_info
-            if [[ ${REMOTE_DOCKER_REGISTRY_UNREACHABLE:=} != "true" ]]; then
-                build_images::get_local_image_info
-                if [[ ${LOCAL_MANIFEST_IMAGE_UNAVAILABLE:=} != "true" ]]; then
-                    build_images::compare_layers
-                else
-                    FORCE_PULL_IMAGES="true"
-                fi
+            build_images::get_remote_image_build_cache_hash
+            if [[ ${REMOTE_DOCKER_REGISTRY_UNREACHABLE:=} != "true" && \
+                  ${LOCAL_MANIFEST_IMAGE_UNAVAILABLE:=} != "true" ]]; then
+                    build_images::compare_local_and_remote_build_cache_hash
+            else
+                FORCE_PULL_IMAGES="true"
             fi
         fi
         SKIP_REBUILD="false"
@@ -453,6 +449,7 @@ function build_images::rebuild_ci_image_if_needed() {
             verbosity::print_info "Build start: ${THE_IMAGE_TYPE} image."
             verbosity::print_info
             build_images::build_ci_image
+            build_images::get_local_build_cache_hash
             md5sum::update_all_md5
             build_images::build_ci_image_manifest
             verbosity::print_info
diff --git a/scripts/ci/libraries/_initialization.sh b/scripts/ci/libraries/_initialization.sh
index b58fa2e..396bded 100644
--- a/scripts/ci/libraries/_initialization.sh
+++ b/scripts/ci/libraries/_initialization.sh
@@ -186,9 +186,9 @@ function initialization::initialize_files_for_rebuild_check() {
         "Dockerfile.ci"
         ".dockerignore"
         "airflow/version.py"
-        "airflow/www/package.json"
-        "airflow/www/yarn.lock"
-        "airflow/www/webpack.config.js"
+        "airflow/www_rbac/package.json"
+        "airflow/www_rbac/yarn.lock"
+        "airflow/www_rbac/webpack.config.js"
     )
 }
 
@@ -452,6 +452,12 @@ function initialization::initialize_test_variables() {
     export TEST_TYPE=${TEST_TYPE:=""}
 }
 
+function initialization::initialize_build_image_variables() {
+    REMOTE_IMAGE_CONTAINER_ID_FILE="${AIRFLOW_SOURCES}/manifests/remote-airflow-manifest-image"
+    LOCAL_IMAGE_BUILD_CACHE_HASH_FILE="${AIRFLOW_SOURCES}/manifests/local-build-cache-hash"
+    REMOTE_IMAGE_BUILD_CACHE_HASH_FILE="${AIRFLOW_SOURCES}/manifests/remote-build-cache-hash"
+}
+
 # Common environment that is initialized by both Breeze and CI scripts
 function initialization::initialize_common_environment() {
     initialization::create_directories
@@ -469,6 +475,7 @@ function initialization::initialize_common_environment() {
     initialization::initialize_git_variables
     initialization::initialize_github_variables
     initialization::initialize_test_variables
+    initialization::initialize_build_image_variables
 }
 
 function initialization::set_default_python_version_if_empty() {
@@ -750,6 +757,10 @@ function initialization::make_constants_read_only() {
     readonly BUILT_CI_IMAGE_FLAG_FILE
     readonly INIT_SCRIPT_FILE
 
+    readonly REMOTE_IMAGE_CONTAINER_ID_FILE
+    readonly LOCAL_IMAGE_BUILD_CACHE_HASH_FILE
+    readonly REMOTE_IMAGE_BUILD_CACHE_HASH_FILE
+
 }
 
 # converts parameters to json array
@@ -772,7 +783,7 @@ function initialization::ga_output() {
 
 function initialization::ga_env() {
     if [[ ${GITHUB_ENV=} != "" ]]; then
-        echo "${1}=${2}" >> "${GITHUB_ENV}"
+        echo "${1}=${2}" >>"${GITHUB_ENV}"
     fi
 }