Posted to commits@airflow.apache.org by po...@apache.org on 2020/06/20 16:53:01 UTC

[airflow] 02/25: Don't use the term "whitelist" - language matters (#9174)

This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch v1-10-test
in repository https://gitbox.apache.org/repos/asf/airflow.git

commit 076b7eea3f226b7561f2f3cdd06b4e22a17fb234
Author: Ash Berlin-Taylor <as...@firemirror.com>
AuthorDate: Mon Jun 8 10:01:46 2020 +0100

    Don't use the term "whitelist" - language matters (#9174)
    
    It's fairly common to say whitelisting and blacklisting to describe
    desirable and undesirable things in cyber security. However, just
    because it is common doesn't mean it's right.
    
    The issue with the terminology is that it only makes sense if you
    equate white with 'good, permitted, safe' and black with 'bad,
    dangerous, forbidden'. There are some obvious problems with this.
    
    You may not see why this matters. If you're not adversely affected by
    racial stereotyping yourself, then please count yourself lucky. For some
    of your friends and colleagues (and potential future colleagues), this
    really is a change worth making.
    
    From now on, we will use 'allow list' and 'deny list' in place of
    'whitelist' and 'blacklist' wherever possible. These terms are, in
    fact, clearer and less ambiguous. So as well as being more inclusive,
    this is a net benefit to our understandability.
    
    (Words mostly borrowed from
    <https://www.ncsc.gov.uk/blog-post/terminology-its-not-black-and-white>)
    
    Co-authored-by: Jarek Potiuk <ja...@potiuk.com>
    (cherry picked from commit 6350fd6ebb9958982cb3fa1d466168fc31708035)
---
 .pre-commit-config.yaml                            | 17 +++++++++++++++
 BREEZE.rst                                         | 12 +++++------
 STATIC_CODE_CHECKS.rst                             |  2 ++
 UPDATING.md                                        | 11 ++++------
 .../config_templates/default_webserver_config.py   |  1 -
 airflow/contrib/plugins/metastore_browser/main.py  | 12 +++++------
 airflow/jobs/scheduler_job.py                      | 18 ++++++++--------
 airflow/operators/hive_stats_operator.py           | 24 +++++++++++++++-------
 airflow/plugins_manager.py                         |  2 +-
 airflow/www/static/underscore.js                   |  4 ++--
 breeze-complete                                    |  1 +
 docs/security.rst                                  |  4 ++--
 tests/contrib/hooks/test_cassandra_hook.py         |  2 +-
 tests/contrib/operators/test_hive_stats.py         |  6 +++---
 tests/utils/test_dag_processing.py                 |  4 ++--
 15 files changed, 73 insertions(+), 47 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 88ab3cc..b1852b7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -228,6 +228,23 @@ repos:
         files: ^setup.py$|^INSTALL$|^CONTRIBUTING.rst$
         pass_filenames: false
         require_serial: true
+      - id: pydevd
+        language: pygrep
+        name: Check for pydevd debug statements accidentally left in the code
+        entry: "pydevd.*settrace\\("
+        pass_filenames: true
+        files: \.py$
+      - id: language-matters
+        language: pygrep
+        name: Check for language that we do not accept as a community
+        description: Please use "deny_list" or "allow_list" instead.
+        entry: "(?i)(black|white)[_-]?list"
+        pass_filenames: true
+        exclude: >
+          (?x)
+          ^airflow/contrib/hooks/cassandra_hook.py$|
+          ^airflow/operators/hive_stats_operator.py$|
+          ^tests/contrib/hooks/test_cassandra_hook.py
       - id: dont-use-safe-filter
         language: pygrep
         name: Don't use safe in templates
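
For reference, the new language-matters hook is a plain pygrep check: the
entry (?i)(black|white)[_-]?list is an ordinary Python regex applied line by
line. A minimal standalone sketch of what it does (and does not) flag; the
sample strings below are illustrative only, not taken from the codebase:

    import re

    # Same pattern as the language-matters pre-commit hook above.
    pattern = re.compile(r"(?i)(black|white)[_-]?list")

    flagged = ["DB_WHITELIST", "col_blacklist", "White-List", "blacklisting"]
    accepted = ["allow_list", "deny_list", "excluded_columns"]

    assert all(pattern.search(s) for s in flagged)
    assert not any(pattern.search(s) for s in accepted)
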
diff --git a/BREEZE.rst b/BREEZE.rst
index 3a1a6a0..b2ff795 100644
--- a/BREEZE.rst
+++ b/BREEZE.rst
@@ -1307,9 +1307,9 @@ This is the current syntax for  `./breeze <./breeze>`_:
                  check-executables-have-shebangs check-hooks-apply check-integrations
                  check-merge-conflict check-xml debug-statements detect-private-key doctoc
                  end-of-file-fixer fix-encoding-pragma flake8 forbid-tabs insert-license
-                 lint-dockerfile mixed-line-ending mypy pydevd python2-compile python2-fastcheck
-                 python-no-log-warn rst-backticks setup-order shellcheck trailing-whitespace
-                 update-breeze-file update-extras update-local-yml-file yamllint
+                 language-matters lint-dockerfile mixed-line-ending mypy pydevd python2-compile
+                 python2-fastcheck python-no-log-warn rst-backticks setup-order shellcheck
+                 trailing-whitespace update-breeze-file update-extras update-local-yml-file yamllint
 
         You can pass extra arguments including options to the pre-commit framework as
         <EXTRA_ARGS> passed after --. For example:
@@ -1337,9 +1337,9 @@ This is the current syntax for  `./breeze <./breeze>`_:
                  check-executables-have-shebangs check-hooks-apply check-integrations
                  check-merge-conflict check-xml debug-statements detect-private-key doctoc
                  end-of-file-fixer fix-encoding-pragma flake8 forbid-tabs insert-license
-                 lint-dockerfile mixed-line-ending mypy pydevd python2-compile python2-fastcheck
-                 python-no-log-warn rst-backticks setup-order shellcheck trailing-whitespace
-                 update-breeze-file update-extras update-local-yml-file yamllint
+                 language-matters lint-dockerfile mixed-line-ending mypy pydevd python2-compile
+                 python2-fastcheck python-no-log-warn rst-backticks setup-order shellcheck
+                 trailing-whitespace update-breeze-file update-extras update-local-yml-file yamllint
 
         You can pass extra arguments including options to the pre-commit framework as
         <EXTRA_ARGS> passed after --. For example:
diff --git a/STATIC_CODE_CHECKS.rst b/STATIC_CODE_CHECKS.rst
index 1a5a6f6..3cb2c5e 100644
--- a/STATIC_CODE_CHECKS.rst
+++ b/STATIC_CODE_CHECKS.rst
@@ -76,6 +76,8 @@ require Breeze Docker images to be installed locally:
 ----------------------------------- ---------------------------------------------------------------- ------------
 ``isort``                             Sorts imports in python files.
 ----------------------------------- ---------------------------------------------------------------- ------------
+``language-matters``                  Check for language that we do not accept as a community
+----------------------------------- ---------------------------------------------------------------- ------------
 ``lint-dockerfile``                   Lints a dockerfile.
 ----------------------------------- ---------------------------------------------------------------- ------------
 ``mixed-line-ending``                 Detects if mixed line ending is used (\r vs. \r\n).
diff --git a/UPDATING.md b/UPDATING.md
index 336459b..ea8c262 100644
--- a/UPDATING.md
+++ b/UPDATING.md
@@ -1013,14 +1013,11 @@ dags_are_paused_at_creation = False
 
 If you specify a hive conf to the run_cli command of the HiveHook, Airflow adds some
 convenience variables to the config. In case you run a secure Hadoop setup it might be
-required to whitelist these variables by adding the following to your configuration:
+required to allow these variables by adjusting your Hive configuration to add `airflow\.ctx\..*` to the regex
+of user-editable configuration properties. See
+[the Hive docs on Configuration Properties][hive.security.authorization.sqlstd] for more info.
 
-```
-<property>
-     <name>hive.security.authorization.sqlstd.confwhitelist.append</name>
-     <value>airflow\.ctx\..*</value>
-</property>
-```
+[hive.security.authorization.sqlstd]: https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=82903061#ConfigurationProperties-SQLStandardBasedAuthorization.1
 
 ### Google Cloud Operator and Hook alignment
 
diff --git a/airflow/config_templates/default_webserver_config.py b/airflow/config_templates/default_webserver_config.py
index bc8a6bb..23d3985 100644
--- a/airflow/config_templates/default_webserver_config.py
+++ b/airflow/config_templates/default_webserver_config.py
@@ -65,7 +65,6 @@ AUTH_TYPE = AUTH_DB
 # Google OAuth example:
 # OAUTH_PROVIDERS = [{
 #   'name':'google',
-#     'whitelist': ['@YOU_COMPANY_DOMAIN'],  # optional
 #     'token_key':'access_token',
 #     'icon':'fa-google',
 #         'remote_app': {
diff --git a/airflow/contrib/plugins/metastore_browser/main.py b/airflow/contrib/plugins/metastore_browser/main.py
index 6826030..d5a68a2 100644
--- a/airflow/contrib/plugins/metastore_browser/main.py
+++ b/airflow/contrib/plugins/metastore_browser/main.py
@@ -35,8 +35,8 @@ METASTORE_MYSQL_CONN_ID = 'metastore_mysql'
 PRESTO_CONN_ID = 'presto_default'
 HIVE_CLI_CONN_ID = 'hive_default'
 DEFAULT_DB = 'default'
-DB_WHITELIST = None
-DB_BLACKLIST = ['tmp']
+DB_ALLOW_LIST = None
+DB_DENY_LIST = ['tmp']
 TABLE_SELECTOR_LIMIT = 2000
 
 # Keeping pandas from truncating long strings
@@ -118,11 +118,11 @@ class MetastoreBrowserView(BaseView, wwwutils.DataProfilingMixin):
     @expose('/objects/')
     def objects(self):
         where_clause = ''
-        if DB_WHITELIST:
-            dbs = ",".join(["'" + db + "'" for db in DB_WHITELIST])
+        if DB_ALLOW_LIST:
+            dbs = ",".join(["'" + db + "'" for db in DB_ALLOW_LIST])
             where_clause = "AND b.name IN ({})".format(dbs)
-        if DB_BLACKLIST:
-            dbs = ",".join(["'" + db + "'" for db in DB_BLACKLIST])
+        if DB_DENY_LIST:
+            dbs = ",".join(["'" + db + "'" for db in DB_DENY_LIST])
             where_clause = "AND b.name NOT IN ({})".format(dbs)
         sql = """
         SELECT CONCAT(b.NAME, '.', a.TBL_NAME), TBL_TYPE
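
To make the renamed constants concrete, here is a minimal standalone mirror
of the filtering logic above (a hypothetical helper, not part of the plugin).
Note that, as written, a non-empty deny list overwrites rather than narrows
the allow-list clause:

    def build_where_clause(db_allow_list, db_deny_list):
        # Mirrors MetastoreBrowserView.objects(): each branch rebuilds
        # where_clause from scratch, so the deny list wins if both are set.
        where_clause = ''
        if db_allow_list:
            dbs = ",".join("'" + db + "'" for db in db_allow_list)
            where_clause = "AND b.name IN ({})".format(dbs)
        if db_deny_list:
            dbs = ",".join("'" + db + "'" for db in db_deny_list)
            where_clause = "AND b.name NOT IN ({})".format(dbs)
        return where_clause

    print(build_where_clause(None, ['tmp']))    # AND b.name NOT IN ('tmp')
    print(build_where_clause(['core'], None))   # AND b.name IN ('core')
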
diff --git a/airflow/jobs/scheduler_job.py b/airflow/jobs/scheduler_job.py
index 1dd986e..e142dc5 100644
--- a/airflow/jobs/scheduler_job.py
+++ b/airflow/jobs/scheduler_job.py
@@ -68,8 +68,8 @@ class DagFileProcessor(AbstractDagFileProcessor, LoggingMixin):
     :type file_path: unicode
     :param pickle_dags: whether to serialize the DAG objects to the DB
     :type pickle_dags: bool
-    :param dag_id_white_list: If specified, only look at these DAG ID's
-    :type dag_id_white_list: list[unicode]
+    :param dag_ids: If specified, only look at these DAG IDs
+    :type dag_ids: list[unicode]
     :param zombies: zombie task instances to kill
     :type zombies: list[airflow.utils.dag_processing.SimpleTaskInstance]
     """
@@ -77,12 +77,12 @@ class DagFileProcessor(AbstractDagFileProcessor, LoggingMixin):
     # Counter that increments every time an instance of this class is created
     class_creation_counter = 0
 
-    def __init__(self, file_path, pickle_dags, dag_id_white_list, zombies):
+    def __init__(self, file_path, pickle_dags, dag_ids, zombies):
         self._file_path = file_path
 
         # The process that was launched to process the given file path.
         self._process = None
-        self._dag_id_white_list = dag_id_white_list
+        self._dag_ids = dag_ids
         self._pickle_dags = pickle_dags
         self._zombies = zombies
         # The result of Scheduler.process_file(file_path).
@@ -104,7 +104,7 @@ class DagFileProcessor(AbstractDagFileProcessor, LoggingMixin):
     def _run_file_processor(result_channel,
                             file_path,
                             pickle_dags,
-                            dag_id_white_list,
+                            dag_ids,
                             thread_name,
                             zombies):
         """
@@ -117,9 +117,9 @@ class DagFileProcessor(AbstractDagFileProcessor, LoggingMixin):
         :param pickle_dags: whether to pickle the DAGs found in the file and
             save them to the DB
         :type pickle_dags: bool
-        :param dag_id_white_list: if specified, only examine DAG ID's that are
+        :param dag_ids: if specified, only examine DAG IDs that are
             in this list
-        :type dag_id_white_list: list[unicode]
+        :type dag_ids: list[unicode]
         :param thread_name: the name to use for the process that is launched
         :type thread_name: unicode
         :param zombies: zombie task instances to kill
@@ -152,7 +152,7 @@ class DagFileProcessor(AbstractDagFileProcessor, LoggingMixin):
 
             log.info("Started process (PID=%s) to work on %s",
                      os.getpid(), file_path)
-            scheduler_job = SchedulerJob(dag_ids=dag_id_white_list, log=log)
+            scheduler_job = SchedulerJob(dag_ids=dag_ids, log=log)
             result = scheduler_job.process_file(file_path,
                                                 zombies,
                                                 pickle_dags)
@@ -184,7 +184,7 @@ class DagFileProcessor(AbstractDagFileProcessor, LoggingMixin):
                 _child_channel,
                 self.file_path,
                 self._pickle_dags,
-                self._dag_id_white_list,
+                self._dag_ids,
                 "DagFileProcessor{}".format(self._instance_id),
                 self._zombies
             ),
diff --git a/airflow/operators/hive_stats_operator.py b/airflow/operators/hive_stats_operator.py
index fe8ff9d..8648845 100644
--- a/airflow/operators/hive_stats_operator.py
+++ b/airflow/operators/hive_stats_operator.py
@@ -20,6 +20,7 @@
 from builtins import zip
 from collections import OrderedDict
 import json
+import warnings
 
 from airflow.exceptions import AirflowException
 from airflow.hooks.mysql_hook import MySqlHook
@@ -49,9 +50,9 @@ class HiveStatsCollectionOperator(BaseOperator):
     :param extra_exprs: dict of expression to run against the table where
         keys are metric names and values are Presto compatible expressions
     :type extra_exprs: dict
-    :param col_blacklist: list of columns to blacklist, consider
-        blacklisting blobs, large json columns, ...
-    :type col_blacklist: list
+    :param excluded_columns: list of columns to exclude, consider
+        excluding blobs, large json columns, ...
+    :type excluded_columns: list
     :param assignment_func: a function that receives a column name and
         a type, and returns a dict of metric names and Presto expressions.
         If None is returned, the global defaults are applied. If an
@@ -69,18 +70,27 @@ class HiveStatsCollectionOperator(BaseOperator):
             table,
             partition,
             extra_exprs=None,
-            col_blacklist=None,
+            excluded_columns=None,
             assignment_func=None,
             metastore_conn_id='metastore_default',
             presto_conn_id='presto_default',
             mysql_conn_id='airflow_db',
             *args, **kwargs):
-        super(HiveStatsCollectionOperator, self).__init__(*args, **kwargs)
+        if 'col_blacklist' in kwargs:
+            warnings.warn(
+                'col_blacklist kwarg passed to {c} (task_id: {t}) is deprecated, please rename it to '
+                'excluded_columns instead'.format(
+                    c=self.__class__.__name__, t=kwargs.get('task_id')),
+                category=FutureWarning,
+                stacklevel=2
+            )
+            excluded_columns = kwargs.pop('col_blacklist')
 
+        super(HiveStatsCollectionOperator, self).__init__(*args, **kwargs)
         self.table = table
         self.partition = partition
         self.extra_exprs = extra_exprs or {}
-        self.col_blacklist = col_blacklist or {}
+        self.excluded_columns = excluded_columns or {}
         self.metastore_conn_id = metastore_conn_id
         self.presto_conn_id = presto_conn_id
         self.mysql_conn_id = mysql_conn_id
@@ -89,7 +99,7 @@ class HiveStatsCollectionOperator(BaseOperator):
         self.dttm = '{{ execution_date.isoformat() }}'
 
     def get_default_exprs(self, col, col_type):
-        if col in self.col_blacklist:
+        if col in self.excluded_columns:
             return {}
         d = {(col, 'non_null'): "COUNT({col})"}
         if col_type in ['double', 'int', 'bigint', 'float']:
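
The shim above keeps old DAG code working while nudging users toward the new
kwarg name. A hedged sketch of how it behaves, assuming an Airflow 1.10
environment with the Hive extras installed (table, partition, and column
names are placeholders):

    import warnings

    from airflow.operators.hive_stats_operator import HiveStatsCollectionOperator

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        op = HiveStatsCollectionOperator(
            task_id='hive_stats_check',
            table='example_table',
            partition={'ds': '2020-01-01'},
            col_blacklist=['raw_blob'],  # deprecated kwarg, still accepted
        )

    # The value is carried over and a FutureWarning is emitted.
    assert op.excluded_columns == ['raw_blob']
    assert any(w.category is FutureWarning for w in caught)
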
diff --git a/airflow/plugins_manager.py b/airflow/plugins_manager.py
index 317cbde..d68f882 100644
--- a/airflow/plugins_manager.py
+++ b/airflow/plugins_manager.py
@@ -114,7 +114,7 @@ def register_inbuilt_operator_links():
     Register all the Operators Links that are already defined for the operators
     in the "airflow" project. Example: QDSLink (Operator Link for Qubole Operator)
 
-    This is required to populate the "whitelist" of allowed classes when deserializing operator links
+    This is required to populate the allow list of classes permitted when deserializing operator links
     """
     inbuilt_operator_links = set()  # type: Set[Type]
 
diff --git a/airflow/www/static/underscore.js b/airflow/www/static/underscore.js
index 70fae3f..7252539 100644
--- a/airflow/www/static/underscore.js
+++ b/airflow/www/static/underscore.js
@@ -806,7 +806,7 @@
     return obj;
   };
 
-  // Return a copy of the object only containing the whitelisted properties.
+  // Return a copy of the object only containing the allowed properties.
   _.pick = function(obj) {
     var copy = {};
     var keys = concat.apply(ArrayProto, slice.call(arguments, 1));
@@ -816,7 +816,7 @@
     return copy;
   };
 
-   // Return a copy of the object without the blacklisted properties.
+   // Return a copy of the object without the disallowed properties.
   _.omit = function(obj) {
     var copy = {};
     var keys = concat.apply(ArrayProto, slice.call(arguments, 1));
diff --git a/breeze-complete b/breeze-complete
index d1df5b9..5d8a724 100644
--- a/breeze-complete
+++ b/breeze-complete
@@ -60,6 +60,7 @@ fix-encoding-pragma
 flake8
 forbid-tabs
 insert-license
+language-matters
 lint-dockerfile
 mixed-line-ending
 mypy
diff --git a/docs/security.rst b/docs/security.rst
index e6bced8..0e49885 100644
--- a/docs/security.rst
+++ b/docs/security.rst
@@ -322,7 +322,7 @@ GitHub Enterprise (GHE) Authentication
 
 The GitHub Enterprise authentication backend can be used to authenticate users
 against an installation of GitHub Enterprise using OAuth2. You can optionally
-specify a team whitelist (composed of slug cased team names) to restrict login
+specify a team allowed list (composed of slug-cased team names) to restrict login
 to only members of those teams.
 
 .. code-block:: bash
@@ -338,7 +338,7 @@ to only members of those teams.
     oauth_callback_route = /example/ghe_oauth/callback
     allowed_teams = 1, 345, 23
 
-.. note:: If you do not specify a team whitelist, anyone with a valid account on
+.. note:: If you do not specify a team allowed list, anyone with a valid account on
    your GHE installation will be able to log in to Airflow.
 
 To use GHE authentication, you must install Airflow with the ``github_enterprise`` extras group:
diff --git a/tests/contrib/hooks/test_cassandra_hook.py b/tests/contrib/hooks/test_cassandra_hook.py
index a798152..b8c120b 100644
--- a/tests/contrib/hooks/test_cassandra_hook.py
+++ b/tests/contrib/hooks/test_cassandra_hook.py
@@ -113,7 +113,7 @@ class TestCassandraHook(unittest.TestCase):
                                    TokenAwarePolicy,
                                    expected_child_policy_type=RoundRobinPolicy)
 
-    def test_get_lb_policy_no_host_for_white_list(self):
+    def test_get_lb_policy_no_host_for_allow_list(self):
         # test that not specifying a host for WhiteListRoundRobinPolicy raises an exception
         self._assert_get_lb_policy('WhiteListRoundRobinPolicy',
                                    {},
diff --git a/tests/contrib/operators/test_hive_stats.py b/tests/contrib/operators/test_hive_stats.py
index 7df8e35..69c9b74 100644
--- a/tests/contrib/operators/test_hive_stats.py
+++ b/tests/contrib/operators/test_hive_stats.py
@@ -60,9 +60,9 @@ class TestHiveStatsCollectionOperator(TestHiveEnvironment):
             (col, 'non_null'): 'COUNT({})'.format(col)
         })
 
-    def test_get_default_exprs_blacklist(self):
-        col = 'blacklisted_col'
-        self.kwargs.update(dict(col_blacklist=[col]))
+    def test_get_default_exprs_excluded_cols(self):
+        col = 'excluded_col'
+        self.kwargs.update(dict(excluded_columns=[col]))
 
         default_exprs = HiveStatsCollectionOperator(**self.kwargs).get_default_exprs(col, None)
 
diff --git a/tests/utils/test_dag_processing.py b/tests/utils/test_dag_processing.py
index de8f06c..2232310 100644
--- a/tests/utils/test_dag_processing.py
+++ b/tests/utils/test_dag_processing.py
@@ -246,9 +246,9 @@ class TestDagFileProcessorManager(unittest.TestCase):
             class FakeDagFileProcessor(DagFileProcessor):
                 # This fake processor will return the zombies it received in constructor
                 # as its processing result w/o actually parsing anything.
-                def __init__(self, file_path, pickle_dags, dag_id_white_list, zombies):
+                def __init__(self, file_path, pickle_dags, dag_ids, zombies):
                     super(FakeDagFileProcessor, self).__init__(
-                        file_path, pickle_dags, dag_id_white_list, zombies
+                        file_path, pickle_dags, dag_ids, zombies
                     )
 
                     self._result = zombies, 0