You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by po...@apache.org on 2023/08/25 15:54:51 UTC

[airflow] 04/05: Improve detection of when breeze CI image needs rebuilding (#33603)

This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch v2-7-test
in repository https://gitbox.apache.org/repos/asf/airflow.git

commit 6005da9ba5b1bb7ce004a69a6d8be75d43e12a53
Author: Jarek Potiuk <ja...@potiuk.com>
AuthorDate: Tue Aug 22 13:27:03 2023 +0200

    Improve detection of when breeze CI image needs rebuilding (#33603)
    
    * Improve detection of when breeze CI image needs rebuilding
    
    Previously we have been using provider.yaml file modification as
    a sign that the docker image needs rebuilding when starting image.
    However just modification of provider.yaml file is not a sign
    that the image needs rebuilding. The image needs rebuilding when
    provider dependencies changed, but there are many more reasons why
    provider.yaml file changed - especially recently provider.yaml
    file contains much more information and dependencies are only part
    of it. Provider.yaml files can also be modified by release manager
    wnen documentation is prepared, but none of the documentation
    change is a reason for rebuilding the image.
    
    This PR optimize the check for image building introducing two
    step process:
    
    * first we check if provider.yaml files changed
    * if they did, we regenerate provider dependencies by manully
      running the pre-commit script
    * then provider_dependencies.json is used instead of all providers
      to determine if the image needs rebuilding
    
    This has several nice side effects:
    
    * the list of files that have been modified displayed to the
      user is potentially much smaller (no provider.yaml files)
    * provider_dependencies.json is regenereated automatically when
      you run any breeze command, which means that you do not have
      to have pre-commit installed to regenerate it
    * the notification "image needs rebuilding" will be printed less
      frequently to the user - only when it is really needed
    * preparing provider documentation in CI will not trigger
      image rebuilding (which might occasionally fail in such case
      especially when we bring back a provider from long suspension
      like it happened in #33574
    
    * Update dev/breeze/src/airflow_breeze/commands/developer_commands.py
    
    (cherry picked from commit ac0d5b3dbe731605af38018ce7ce970ffded539a)
---
 .../airflow_breeze/commands/ci_image_commands.py   |  6 +-
 .../airflow_breeze/commands/developer_commands.py  |  3 +
 dev/breeze/src/airflow_breeze/global_constants.py  |  2 +-
 .../src/airflow_breeze/params/build_ci_params.py   |  1 +
 .../src/airflow_breeze/params/shell_params.py      |  1 +
 .../src/airflow_breeze/utils/md5_build_check.py    | 72 +++++++++++++++++-----
 .../pre_commit_update_providers_dependencies.py    | 34 +++++++---
 7 files changed, 93 insertions(+), 26 deletions(-)

diff --git a/dev/breeze/src/airflow_breeze/commands/ci_image_commands.py b/dev/breeze/src/airflow_breeze/commands/ci_image_commands.py
index 124d6ca318..2eecbfff3c 100644
--- a/dev/breeze/src/airflow_breeze/commands/ci_image_commands.py
+++ b/dev/breeze/src/airflow_breeze/commands/ci_image_commands.py
@@ -455,7 +455,10 @@ def should_we_run_the_build(build_ci_params: BuildCiParams) -> bool:
     # We import those locally so that click autocomplete works
     from inputimeout import TimeoutOccurred
 
-    if not md5sum_check_if_build_is_needed(md5sum_cache_dir=build_ci_params.md5sum_cache_dir):
+    if not md5sum_check_if_build_is_needed(
+        md5sum_cache_dir=build_ci_params.md5sum_cache_dir,
+        skip_provider_dependencies_check=build_ci_params.skip_provider_dependencies_check,
+    ):
         return False
     try:
         answer = user_confirm(
@@ -631,6 +634,7 @@ def rebuild_or_pull_ci_image_if_needed(command_params: ShellParams | BuildCiPara
         image_tag=command_params.image_tag,
         platform=command_params.platform,
         force_build=command_params.force_build,
+        skip_provider_dependencies_check=command_params.skip_provider_dependencies_check,
     )
     if command_params.image_tag is not None and command_params.image_tag != "latest":
         return_code, message = run_pull_image(
diff --git a/dev/breeze/src/airflow_breeze/commands/developer_commands.py b/dev/breeze/src/airflow_breeze/commands/developer_commands.py
index f29584bb32..4a694cac2e 100644
--- a/dev/breeze/src/airflow_breeze/commands/developer_commands.py
+++ b/dev/breeze/src/airflow_breeze/commands/developer_commands.py
@@ -516,6 +516,9 @@ def static_checks(
         force_build=force_build,
         image_tag=image_tag,
         github_repository=github_repository,
+        # for static checks we do not want to regenerate dependencies before pre-commits are run
+        # we want the pre-commit to do it for us (and detect the case the dependencies are updated)
+        skip_provider_dependencies_check=True,
     )
     if not skip_image_check:
         rebuild_or_pull_ci_image_if_needed(command_params=build_params)
diff --git a/dev/breeze/src/airflow_breeze/global_constants.py b/dev/breeze/src/airflow_breeze/global_constants.py
index c9a9066f07..d79a6561d2 100644
--- a/dev/breeze/src/airflow_breeze/global_constants.py
+++ b/dev/breeze/src/airflow_breeze/global_constants.py
@@ -305,13 +305,13 @@ FILES_FOR_REBUILD_CHECK = [
     "setup.cfg",
     "Dockerfile.ci",
     ".dockerignore",
+    "generated/provider_dependencies.json",
     "scripts/docker/common.sh",
     "scripts/docker/install_additional_dependencies.sh",
     "scripts/docker/install_airflow.sh",
     "scripts/docker/install_airflow_dependencies_from_branch_tip.sh",
     "scripts/docker/install_from_docker_context_files.sh",
     "scripts/docker/install_mysql.sh",
-    *ALL_PROVIDER_YAML_FILES,
 ]
 
 ENABLED_SYSTEMS = ""
diff --git a/dev/breeze/src/airflow_breeze/params/build_ci_params.py b/dev/breeze/src/airflow_breeze/params/build_ci_params.py
index 8888a7398f..4ad0b82789 100644
--- a/dev/breeze/src/airflow_breeze/params/build_ci_params.py
+++ b/dev/breeze/src/airflow_breeze/params/build_ci_params.py
@@ -37,6 +37,7 @@ class BuildCiParams(CommonBuildParams):
     airflow_pre_cached_pip_packages: bool = True
     force_build: bool = False
     eager_upgrade_additional_requirements: str = ""
+    skip_provider_dependencies_check: bool = False
 
     @property
     def airflow_version(self):
diff --git a/dev/breeze/src/airflow_breeze/params/shell_params.py b/dev/breeze/src/airflow_breeze/params/shell_params.py
index c8dfd9cd84..405fc7bcfb 100644
--- a/dev/breeze/src/airflow_breeze/params/shell_params.py
+++ b/dev/breeze/src/airflow_breeze/params/shell_params.py
@@ -125,6 +125,7 @@ class ShellParams:
     celery_flower: bool = False
     only_min_version_update: bool = False
     regenerate_missing_docs: bool = False
+    skip_provider_dependencies_check: bool = False
 
     def clone_with_test(self, test_type: str) -> ShellParams:
         new_params = deepcopy(self)
diff --git a/dev/breeze/src/airflow_breeze/utils/md5_build_check.py b/dev/breeze/src/airflow_breeze/utils/md5_build_check.py
index 4397fece4d..54b46c9916 100644
--- a/dev/breeze/src/airflow_breeze/utils/md5_build_check.py
+++ b/dev/breeze/src/airflow_breeze/utils/md5_build_check.py
@@ -20,11 +20,14 @@ Utilities to check - with MD5 - whether files have been modified since the last
 from __future__ import annotations
 
 import hashlib
+import os
+import sys
 from pathlib import Path
 
-from airflow_breeze.global_constants import FILES_FOR_REBUILD_CHECK
+from airflow_breeze.global_constants import ALL_PROVIDER_YAML_FILES, FILES_FOR_REBUILD_CHECK
 from airflow_breeze.utils.console import get_console
 from airflow_breeze.utils.path_utils import AIRFLOW_SOURCES_ROOT
+from airflow_breeze.utils.run_utils import run_command
 
 
 def check_md5checksum_in_cache_modified(file_hash: str, cache_path: Path, update: bool) -> bool:
@@ -59,8 +62,19 @@ def generate_md5(filename, file_size: int = 65536):
     return hash_md5.hexdigest()
 
 
+def check_md5_sum_for_file(file_to_check: str, md5sum_cache_dir: Path, update: bool):
+    file_to_get_md5 = AIRFLOW_SOURCES_ROOT / file_to_check
+    md5_checksum = generate_md5(file_to_get_md5)
+    sub_dir_name = file_to_get_md5.parts[-2]
+    actual_file_name = file_to_get_md5.parts[-1]
+    cache_file_name = Path(md5sum_cache_dir, sub_dir_name + "-" + actual_file_name + ".md5sum")
+    file_content = md5_checksum + "  " + str(file_to_get_md5) + "\n"
+    is_modified = check_md5checksum_in_cache_modified(file_content, cache_file_name, update=update)
+    return is_modified
+
+
 def calculate_md5_checksum_for_files(
-    md5sum_cache_dir: Path, update: bool = False
+    md5sum_cache_dir: Path, update: bool = False, skip_provider_dependencies_check: bool = False
 ) -> tuple[list[str], list[str]]:
     """
     Calculates checksums for all interesting files and stores the hashes in the md5sum_cache_dir.
@@ -68,36 +82,64 @@ def calculate_md5_checksum_for_files(
 
     :param md5sum_cache_dir: directory where to store cached information
     :param update: whether to update the hashes
+    :param skip_provider_dependencies_check: whether to skip regeneration of the provider dependencies
     :return: Tuple of two lists: modified and not-modified files
     """
     not_modified_files = []
     modified_files = []
-    for calculate_md5_file in FILES_FOR_REBUILD_CHECK:
-        file_to_get_md5 = AIRFLOW_SOURCES_ROOT / calculate_md5_file
-        md5_checksum = generate_md5(file_to_get_md5)
-        sub_dir_name = file_to_get_md5.parts[-2]
-        actual_file_name = file_to_get_md5.parts[-1]
-        cache_file_name = Path(md5sum_cache_dir, sub_dir_name + "-" + actual_file_name + ".md5sum")
-        file_content = md5_checksum + "  " + str(file_to_get_md5) + "\n"
-        is_modified = check_md5checksum_in_cache_modified(file_content, cache_file_name, update=update)
+    if not skip_provider_dependencies_check:
+        modified_provider_yaml_files = []
+        for file in ALL_PROVIDER_YAML_FILES:
+            # Only check provider yaml files once and save the result immediately.
+            # If we need to regenerate the dependencies and they are not modified then
+            # all is fine and we can save checksums for the new files
+            if check_md5_sum_for_file(file, md5sum_cache_dir, True):
+                modified_provider_yaml_files.append(file)
+        if modified_provider_yaml_files:
+            get_console().print(
+                "[info]Attempting to generate provider dependencies. "
+                "Provider yaml files changed since last check:[/]"
+            )
+            get_console().print(
+                [os.fspath(file.relative_to(AIRFLOW_SOURCES_ROOT)) for file in modified_provider_yaml_files]
+            )
+            # Regenerate provider_dependencies.json
+            run_command(
+                [
+                    sys.executable,
+                    os.fspath(
+                        AIRFLOW_SOURCES_ROOT
+                        / "scripts"
+                        / "ci"
+                        / "pre_commit"
+                        / "pre_commit_update_providers_dependencies.py"
+                    ),
+                ],
+                cwd=AIRFLOW_SOURCES_ROOT,
+            )
+    for file in FILES_FOR_REBUILD_CHECK:
+        is_modified = check_md5_sum_for_file(file, md5sum_cache_dir, update)
         if is_modified:
-            modified_files.append(calculate_md5_file)
+            modified_files.append(file)
         else:
-            not_modified_files.append(calculate_md5_file)
+            not_modified_files.append(file)
     return modified_files, not_modified_files
 
 
-def md5sum_check_if_build_is_needed(md5sum_cache_dir: Path) -> bool:
+def md5sum_check_if_build_is_needed(md5sum_cache_dir: Path, skip_provider_dependencies_check: bool) -> bool:
     """
     Checks if build is needed based on whether important files were modified.
 
     :param md5sum_cache_dir: directory where cached md5 sums are stored
+    :param skip_provider_dependencies_check: whether to skip regeneration of the provider dependencies
 
     :return: True if build is needed.
     """
     build_needed = False
-    modified_files, not_modified_files = calculate_md5_checksum_for_files(md5sum_cache_dir, update=False)
-    if len(modified_files) > 0:
+    modified_files, not_modified_files = calculate_md5_checksum_for_files(
+        md5sum_cache_dir, update=False, skip_provider_dependencies_check=skip_provider_dependencies_check
+    )
+    if modified_files:
         get_console().print(
             f"[warning]The following important files are modified in {AIRFLOW_SOURCES_ROOT} "
             f"since last time image was built: [/]\n\n"
diff --git a/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py b/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py
index 0c489bad63..c8ea48ec48 100755
--- a/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py
+++ b/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py
@@ -209,12 +209,28 @@ if __name__ == "__main__":
         console.print("[red]Errors found during verification. Exiting!")
         console.print()
         sys.exit(1)
-    DEPENDENCIES_JSON_FILE_PATH.write_text(json.dumps(unique_sorted_dependencies, indent=2) + "\n")
-    console.print(
-        f"[yellow]If you see changes to the {DEPENDENCIES_JSON_FILE_PATH} file - "
-        f"do not modify the file manually. Let pre-commit do the job!"
-    )
-    console.print()
-    console.print("[green]Verification complete! Success!\n")
-    console.print(f"Written {DEPENDENCIES_JSON_FILE_PATH}")
-    console.print()
+    old_dependencies = DEPENDENCIES_JSON_FILE_PATH.read_text()
+    new_dependencies = json.dumps(unique_sorted_dependencies, indent=2) + "\n"
+    if new_dependencies != old_dependencies:
+        DEPENDENCIES_JSON_FILE_PATH.write_text(json.dumps(unique_sorted_dependencies, indent=2) + "\n")
+        if os.environ.get("CI"):
+            console.print()
+            console.print(f"[info]Written {DEPENDENCIES_JSON_FILE_PATH}")
+            console.print(
+                f"[yellow]You will need to run breeze locally and commit "
+                f"{DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)}!\n"
+            )
+            console.print()
+        else:
+            console.print()
+            console.print(
+                f"[yellow]Regenerated new dependencies. Please commit "
+                f"{DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)}!\n"
+            )
+            console.print(f"[info]Written {DEPENDENCIES_JSON_FILE_PATH}")
+            console.print()
+    else:
+        console.print(
+            "[green]No need to regenerate dependencies!\n[/]"
+            f"The {DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)} is up to date!\n"
+        )