You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by GitBox <gi...@apache.org> on 2022/02/06 17:35:21 UTC

[GitHub] [airflow] pateash commented on a change in pull request #21363: Databricks SQL operator

pateash commented on a change in pull request #21363:
URL: https://github.com/apache/airflow/pull/21363#discussion_r800209846



##########
File path: airflow/providers/databricks/hooks/databricks.py
##########
@@ -22,25 +22,14 @@
 operators talk to the ``api/2.0/jobs/runs/submit``

Review comment:
       Does it talk specifically to **2.0**, or to **2.x** in general?
   I am asking because 2.1 is already out.

##########
File path: airflow/providers/databricks/hooks/databricks.py
##########
@@ -22,25 +22,14 @@
 operators talk to the ``api/2.0/jobs/runs/submit``

Review comment:
       ```suggestion
   operators talk to the ``api/2.x/jobs/runs/submit``
   ```

##########
File path: airflow/providers/databricks/hooks/databricks_sql.py
##########
@@ -0,0 +1,170 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+from contextlib import closing
+from copy import copy
+from typing import Any, Dict, List, Optional, Union
+
+from databricks import sql
+from databricks.sql.client import Connection
+
+from airflow.exceptions import AirflowException
+from airflow.hooks.dbapi import DbApiHook
+from airflow.providers.databricks.hooks.databricks_base import DatabricksBaseHook
+
+
+class DatabricksSqlHook(DatabricksBaseHook, DbApiHook):
+    """
+    Interact with Databricks SQL.
+
+    :param databricks_conn_id: Reference to the :ref:`Databricks connection <howto/connection:databricks>`.
+    :param http_path: Optional string specifying HTTP path of Databricks SQL Endpoint or cluster.
+        If not specified, it should be specified in the Databricks connection's extra parameters.
+    :param session_configuration: An optional dictionary of Spark session parameters. Defaults to None.
+        If not specified, it could be specified in the Databricks connection's extra parameters.
+    """
+
+    conn_name_attr = 'databricks_conn_id'

Review comment:
       ```suggestion
   ```

##########
File path: airflow/providers/databricks/hooks/databricks_base.py
##########
@@ -0,0 +1,289 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Databricks hook.
+
+This hook enable the submitting and running of jobs to the Databricks platform. Internally the
+operators talk to the ``api/2.0/jobs/runs/submit``
+`endpoint <https://docs.databricks.com/api/latest/jobs.html#runs-submit>`_.
+"""
+import sys
+import time
+from time import sleep
+from typing import Dict, Optional
+from urllib.parse import urlparse
+
+import requests
+from requests import exceptions as requests_exceptions
+
+from airflow import __version__
+from airflow.exceptions import AirflowException
+from airflow.hooks.base import BaseHook
+from airflow.models import Connection
+
+if sys.version_info >= (3, 8):
+    from functools import cached_property
+else:
+    from cached_property import cached_property
+
+USER_AGENT_HEADER = {'user-agent': f'airflow-{__version__}'}
+
+RUN_LIFE_CYCLE_STATES = ['PENDING', 'RUNNING', 'TERMINATING', 'TERMINATED', 'SKIPPED', 'INTERNAL_ERROR']
+
+# https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/aad/service-prin-aad-token#--get-an-azure-active-directory-access-token
+# https://docs.microsoft.com/en-us/graph/deployments#app-registration-and-token-service-root-endpoints
+AZURE_DEFAULT_AD_ENDPOINT = "https://login.microsoftonline.com"
+AZURE_TOKEN_SERVICE_URL = "{}/{}/oauth2/token"
+# https://docs.microsoft.com/en-us/azure/active-directory/managed-identities-azure-resources/how-to-use-vm-token
+AZURE_METADATA_SERVICE_TOKEN_URL = "http://169.254.169.254/metadata/identity/oauth2/token"
+AZURE_METADATA_SERVICE_INSTANCE_URL = "http://169.254.169.254/metadata/instance"
+
+TOKEN_REFRESH_LEAD_TIME = 120
+AZURE_MANAGEMENT_ENDPOINT = "https://management.core.windows.net/"
+DEFAULT_DATABRICKS_SCOPE = "2ff814a6-3304-4ab8-85cb-cd0e6f879c1d"
+
+
+class DatabricksBaseHook(BaseHook):

Review comment:
       ```suggestion
   class BaseDatabricksHook(BaseHook):
   ```

##########
File path: airflow/providers/databricks/hooks/databricks_sql.py
##########
@@ -0,0 +1,170 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+from contextlib import closing
+from copy import copy
+from typing import Any, Dict, List, Optional, Union
+
+from databricks import sql
+from databricks.sql.client import Connection
+
+from airflow.exceptions import AirflowException
+from airflow.hooks.dbapi import DbApiHook
+from airflow.providers.databricks.hooks.databricks_base import DatabricksBaseHook
+
+
+class DatabricksSqlHook(DatabricksBaseHook, DbApiHook):
+    """
+    Interact with Databricks SQL.
+
+    :param databricks_conn_id: Reference to the :ref:`Databricks connection <howto/connection:databricks>`.
+    :param http_path: Optional string specifying HTTP path of Databricks SQL Endpoint or cluster.
+        If not specified, it should be specified in the Databricks connection's extra parameters.
+    :param session_configuration: An optional dictionary of Spark session parameters. Defaults to None.
+        If not specified, it could be specified in the Databricks connection's extra parameters.
+    """
+
+    conn_name_attr = 'databricks_conn_id'

Review comment:
       These 3 fields are already present in **databricks_base**;
   I don't think we should have to redefine them here.

##########
File path: airflow/providers/databricks/hooks/databricks_sql.py
##########
@@ -0,0 +1,170 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+from contextlib import closing
+from copy import copy
+from typing import Any, Dict, List, Optional, Union
+
+from databricks import sql
+from databricks.sql.client import Connection
+
+from airflow.exceptions import AirflowException
+from airflow.hooks.dbapi import DbApiHook
+from airflow.providers.databricks.hooks.databricks_base import DatabricksBaseHook
+
+
+class DatabricksSqlHook(DatabricksBaseHook, DbApiHook):
+    """
+    Interact with Databricks SQL.
+
+    :param databricks_conn_id: Reference to the :ref:`Databricks connection <howto/connection:databricks>`.
+    :param http_path: Optional string specifying HTTP path of Databricks SQL Endpoint or cluster.
+        If not specified, it should be specified in the Databricks connection's extra parameters.
+    :param session_configuration: An optional dictionary of Spark session parameters. Defaults to None.
+        If not specified, it could be specified in the Databricks connection's extra parameters.
+    """
+
+    conn_name_attr = 'databricks_conn_id'
+    default_conn_name = 'databricks_default'

Review comment:
       ```suggestion
   ```

##########
File path: airflow/providers/databricks/hooks/databricks_base.py
##########
@@ -0,0 +1,289 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Databricks hook.
+
+This hook enable the submitting and running of jobs to the Databricks platform. Internally the
+operators talk to the ``api/2.0/jobs/runs/submit``
+`endpoint <https://docs.databricks.com/api/latest/jobs.html#runs-submit>`_.
+"""
+import sys
+import time
+from time import sleep
+from typing import Dict, Optional
+from urllib.parse import urlparse
+
+import requests
+from requests import exceptions as requests_exceptions
+
+from airflow import __version__
+from airflow.exceptions import AirflowException
+from airflow.hooks.base import BaseHook
+from airflow.models import Connection
+
+if sys.version_info >= (3, 8):
+    from functools import cached_property
+else:
+    from cached_property import cached_property
+
+USER_AGENT_HEADER = {'user-agent': f'airflow-{__version__}'}
+
+RUN_LIFE_CYCLE_STATES = ['PENDING', 'RUNNING', 'TERMINATING', 'TERMINATED', 'SKIPPED', 'INTERNAL_ERROR']
+
+# https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/aad/service-prin-aad-token#--get-an-azure-active-directory-access-token
+# https://docs.microsoft.com/en-us/graph/deployments#app-registration-and-token-service-root-endpoints
+AZURE_DEFAULT_AD_ENDPOINT = "https://login.microsoftonline.com"
+AZURE_TOKEN_SERVICE_URL = "{}/{}/oauth2/token"
+# https://docs.microsoft.com/en-us/azure/active-directory/managed-identities-azure-resources/how-to-use-vm-token
+AZURE_METADATA_SERVICE_TOKEN_URL = "http://169.254.169.254/metadata/identity/oauth2/token"
+AZURE_METADATA_SERVICE_INSTANCE_URL = "http://169.254.169.254/metadata/instance"
+
+TOKEN_REFRESH_LEAD_TIME = 120
+AZURE_MANAGEMENT_ENDPOINT = "https://management.core.windows.net/"
+DEFAULT_DATABRICKS_SCOPE = "2ff814a6-3304-4ab8-85cb-cd0e6f879c1d"
+
+
+class DatabricksBaseHook(BaseHook):

Review comment:
       Is it possible to use this Hook directly? If not, it should be named
   BaseDatabricksHook, just to be consistent with other base classes.

##########
File path: airflow/providers/databricks/hooks/databricks_sql.py
##########
@@ -0,0 +1,170 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+from contextlib import closing
+from copy import copy
+from typing import Any, Dict, List, Optional, Union
+
+from databricks import sql
+from databricks.sql.client import Connection
+
+from airflow.exceptions import AirflowException
+from airflow.hooks.dbapi import DbApiHook
+from airflow.providers.databricks.hooks.databricks_base import DatabricksBaseHook
+
+
+class DatabricksSqlHook(DatabricksBaseHook, DbApiHook):
+    """
+    Interact with Databricks SQL.
+
+    :param databricks_conn_id: Reference to the :ref:`Databricks connection <howto/connection:databricks>`.
+    :param http_path: Optional string specifying HTTP path of Databricks SQL Endpoint or cluster.
+        If not specified, it should be specified in the Databricks connection's extra parameters.
+    :param session_configuration: An optional dictionary of Spark session parameters. Defaults to None.
+        If not specified, it could be specified in the Databricks connection's extra parameters.
+    """
+
+    conn_name_attr = 'databricks_conn_id'
+    default_conn_name = 'databricks_default'
+    conn_type = 'databricks'

Review comment:
       ```suggestion
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@airflow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org