You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by po...@apache.org on 2021/08/02 13:44:14 UTC

[airflow] 19/22: More optimized lazy-loading of provider information (#17304)

This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch v2-1-test
in repository https://gitbox.apache.org/repos/asf/airflow.git

commit e1a3acf6acc0eb20470e44d7698146fd34d86001
Author: Jarek Potiuk <ja...@potiuk.com>
AuthorDate: Thu Jul 29 12:14:58 2021 +0200

    More optimized lazy-loading of provider information (#17304)
    
    With this change we truly lazy-load hooks and external_links only
    when we need them. Previously they were loaded when any of the
    properties of ProvidersManager was used, but with this change
    in some scenarios where only extra links are used or when we
    only need list of providers, but we do not need details on
    which custom hooks are needed, there will be much
    faster initialization. This is mainly for some CLI commands
    (for example `airflow providers list` is much faster now), but
    also in some scenarios where for example .get_conn() is never
    used in Tasks, tasks might also never need to import/load the hooks
    and they might perform faster, with smaller memory footprint.
    
    (cherry picked from commit 2dc7aa8e7d5c964076500eac2eaac38507da5841)
---
 airflow/__init__.py          |  6 +++--
 airflow/providers_manager.py | 62 ++++++++++++++++++++++++++++++++++----------
 2 files changed, 52 insertions(+), 16 deletions(-)

diff --git a/airflow/__init__.py b/airflow/__init__.py
index a448491..9f9073e 100644
--- a/airflow/__init__.py
+++ b/airflow/__init__.py
@@ -74,11 +74,13 @@ if not settings.LAZY_LOAD_PLUGINS:
 if not settings.LAZY_LOAD_PROVIDERS:
     from airflow import providers_manager
 
-    providers_manager.ProvidersManager().initialize_providers_manager()
+    manager = providers_manager.ProvidersManager()
+    manager.initialize_providers_list()
+    manager.initialize_providers_hooks()
+    manager.initialize_providers_extra_links()
 
 
 # This is never executed, but tricks static analyzers (PyDev, PyCharm,)
-# into knowing the types of these symbols, and what
 # they contain.
 STATICA_HACK = True
 globals()['kcah_acitats'[::-1].upper()] = False
diff --git a/airflow/providers_manager.py b/airflow/providers_manager.py
index 0770f24..5080995 100644
--- a/airflow/providers_manager.py
+++ b/airflow/providers_manager.py
@@ -22,6 +22,7 @@ import json
 import logging
 import os
 from collections import OrderedDict
+from time import perf_counter
 from typing import Any, Dict, NamedTuple, Set
 
 import jsonschema
@@ -29,6 +30,7 @@ from wtforms import Field
 
 from airflow.utils import yaml
 from airflow.utils.entry_points import entry_points_with_dist
+from airflow.utils.log.logging_mixin import LoggingMixin
 
 try:
     import importlib.resources as importlib_resources
@@ -81,7 +83,7 @@ class ConnectionFormWidgetInfo(NamedTuple):
     field: Field
 
 
-class ProvidersManager:
+class ProvidersManager(LoggingMixin):
     """
     Manages all provider packages. This is a Singleton class. The first time it is
     instantiated, it discovers all available providers in installed packages and
@@ -97,6 +99,7 @@ class ProvidersManager:
         return cls._instance
 
     def __init__(self):
+        """Initializes the manager."""
         # Keeps dict of providers keyed by module name
         self._provider_dict: Dict[str, ProviderInfo] = {}
         # Keeps dict of hooks keyed by connection type
@@ -110,30 +113,61 @@ class ProvidersManager:
         self._customized_form_fields_schema_validator = (
             _create_customized_form_field_behaviours_schema_validator()
         )
-        self._initialized = False
+        self._providers_list_initialized = False
+        self._providers_hooks_initialized = False
+        self._providers_extra_links_initialized = False
 
-    def initialize_providers_manager(self):
-        """Lazy initialization of provider data."""
+    def initialize_providers_list(self):
+        """Lazy initialization of providers list."""
         # We cannot use @cache here because it does not work during pytest, apparently each test
         # runs it it's own namespace and ProvidersManager is a different object in each namespace
-        # even if it is singleton but @cache on the initialize_providers_manager message still works in the
+        # even if it is singleton but @cache on the initialize_providers_*  still works in the
         # way that it is called only once for one of the objects (at least this is how it looks like
         # from running tests)
-        if self._initialized:
+        if self._providers_list_initialized:
             return
+        start_time = perf_counter()
+        self.log.debug("Initializing Providers Manager list")
         # Local source folders are loaded first. They should take precedence over the package ones for
         # Development purpose. In production provider.yaml files are not present in the 'airflow" directory
         # So there is no risk we are going to override package provider accidentally. This can only happen
         # in case of local development
         self._discover_all_airflow_builtin_providers_from_local_sources()
         self._discover_all_providers_from_packages()
-        self._discover_hooks()
         self._provider_dict = OrderedDict(sorted(self._provider_dict.items()))
+        self.log.debug(
+            "Initialization of Providers Manager list took %.2f seconds", perf_counter() - start_time
+        )
+        self._providers_list_initialized = True
+
+    def initialize_providers_hooks(self):
+        """Lazy initialization of providers hooks."""
+        if self._providers_hooks_initialized:
+            return
+        self.initialize_providers_list()
+        start_time = perf_counter()
+        self.log.debug("Initializing Providers Hooks")
+        self._discover_hooks()
         self._hooks_dict = OrderedDict(sorted(self._hooks_dict.items()))
         self._connection_form_widgets = OrderedDict(sorted(self._connection_form_widgets.items()))
         self._field_behaviours = OrderedDict(sorted(self._field_behaviours.items()))
+        self.log.debug(
+            "Initialization of Providers Manager hooks took %.2f seconds", perf_counter() - start_time
+        )
+        self._providers_hooks_initialized = True
+
+    def initialize_providers_extra_links(self):
+        """Lazy initialization of providers extra links."""
+        if self._providers_extra_links_initialized:
+            return
+        self.initialize_providers_list()
+        start_time = perf_counter()
+        self.log.debug("Initializing Providers Extra Links")
         self._discover_extra_links()
-        self._initialized = True
+        self.log.debug(
+            "Initialization of Providers Manager extra links took %.2f seconds", perf_counter() - start_time
+        )
+        self._providers_extra_links_initialized = True
 
     def _discover_all_providers_from_packages(self) -> None:
         """
@@ -385,29 +419,29 @@ class ProvidersManager:
     @property
     def providers(self) -> Dict[str, ProviderInfo]:
         """Returns information about available providers."""
-        self.initialize_providers_manager()
+        self.initialize_providers_list()
         return self._provider_dict
 
     @property
     def hooks(self) -> Dict[str, HookInfo]:
         """Returns dictionary of connection_type-to-hook mapping"""
-        self.initialize_providers_manager()
+        self.initialize_providers_hooks()
         return self._hooks_dict
 
     @property
-    def extra_links_class_names(self):
+    def extra_links_class_names(self) -> Set[str]:
         """Returns set of extra link class names."""
-        self.initialize_providers_manager()
+        self.initialize_providers_extra_links()
         return sorted(self._extra_link_class_name_set)
 
     @property
     def connection_form_widgets(self) -> Dict[str, ConnectionFormWidgetInfo]:
         """Returns widgets for connection forms."""
-        self.initialize_providers_manager()
+        self.initialize_providers_hooks()
         return self._connection_form_widgets
 
     @property
     def field_behaviours(self) -> Dict[str, Dict]:
         """Returns dictionary with field behaviours for connection types."""
-        self.initialize_providers_manager()
+        self.initialize_providers_hooks()
         return self._field_behaviours