You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by je...@apache.org on 2022/08/16 15:35:08 UTC

[airflow] 01/11: [astro] [AIRFLOW-5448] Handle istio-proxy for Kubernetes Pods (#62)

This is an automated email from the ASF dual-hosted git repository.

jedcunningham pushed a commit to tag v2.3.3+astro.2
in repository https://gitbox.apache.org/repos/asf/airflow.git

commit 78ffc8b9c6594fb21bd6c4a1b03277a2b1f36b6b
Author: sjmiller609 <sj...@gmail.com>
AuthorDate: Wed Sep 11 22:39:02 2019 -0400

    [astro] [AIRFLOW-5448] Handle istio-proxy for Kubernetes Pods (#62)
    
    Istio service mesh is not compatible by default with Kubernetes Jobs.
    The normal behavior is that a Job will be started, get an istio-proxy
    sidecar attached to it via the istio mutating webhook, run until
    completion, then the 'main' container in the pod stops, but istio-proxy
    hangs around indefinitely. This change handles cleanly exiting the
    Istio sidecar 'istio-proxy' when a Kubernetes Executor task completes.
    
    (cherry picked from commit 84fa48f53a66a8ccf2b7b2910be49a69b697a6c5)
    (cherry picked from commit 6ed59bfe865c3e841d0cef5a99875f2987df4b75)
    (cherry picked from commit ba60eded479dc16232978758a3c47b7be3db5284)
    (cherry picked from commit 80ac2187dc5bf6cb462a4e10bb19755e09b90d31)
    
    Handle Istio containers with Kubernetes Executor Pod adoption (#1318)
    
    closes https://github.com/astronomer/issues/issues/3030
    
    >This edge case deals specifically with task instances that ended in the UP_FOR_RETRY state when a scheduler is adopting orphaned tasks. Generally, this issue does not affect OSS Airflow since the template kubernetes worker pods spawned don't have additional containers that would prevent the pod from going into the Succeeded pod state. Those pods in the Succeeded state are handled by the scheduler's adoption process in _adopt_completed_pods().
    
    Since Astronomer's kubernetes worker pods have an additional container (istio-proxy), they are in the NotReady state when tasks are not killed and they are not eligible for adoption.
    
    This can also happen for "completed" pods that have sidecars. Same process though, just a slightly different scenario: If a worker finishes while not being watched by a scheduler, it never gets adopted by another scheduler in _adopt_completed_pods() as the pod is still 'Running', but the TI also isn't in a resettable state so scheduler_job never asks the executor to adopt it! It's in limbo - "complete" in Airflow's view (based on TI state) but "Running" in k8s view (since the sidecar i [...]
    
    This commit re-uses current Istio code and handles those pods.
    
    (cherry picked from commit 3f309b057a9cfb59293c130dedff400e3e1a9a52)
    (cherry picked from commit 58cfc68bf6e938fa8d586b49a898200d97abae00)
    (cherry picked from commit 92a8289cabfcb8ec89bff53d4b1e71b6892b066d)
    
    [astro] Fix istio sidecar shutdown on newer GKE
    
    Newer GKE versions have started to emit multiple running events for a
    given pod with the sidecar still being shown as running. We will put
    retries around shutting down the sidecar and also check the current
    status of the sidecar, not just the status at the time of the event.
    
    e.g: GKE > 1.18.20.901
    
    (cherry picked from commit cbd50ef0a38541bdd85055517012a48a5ebe9e2c)
    (cherry picked from commit d1025e1063612995df13e887e0b70cad0c580a16)
    (cherry picked from commit d56ba747a8b7263d0bfe83e3ac46b77a4ec0d113)
    (cherry picked from commit 11a80aede0d1b51e6c424e45805ef3b36d1debaf)
    (cherry picked from commit 1f0e8bea4bb2656c0523a2f177a4dbf5b26ba48e)
    (cherry picked from commit 20b0bad4595cf45d52dcad32621ccd93506e12c7)
    (cherry picked from commit 102efe2282a82242f3f561519096481a3edbf018)
    (cherry picked from commit 765cc50c7e3aca74d62a2e6beeea80c4d1174e23)
    (cherry picked from commit 7f7e45434b28d3b97a6f76d273db715f51f9618f)
    (cherry picked from commit 3026483ff2a6ad9c630a1b21985594548cffc98b)
---
 airflow/executors/kubernetes_executor.py |  20 ++++
 airflow/kubernetes/istio.py              | 170 +++++++++++++++++++++++++++++++
 setup.py                                 |   1 +
 tests/kubernetes/test_istio.py           | 119 ++++++++++++++++++++++
 4 files changed, 310 insertions(+)

diff --git a/airflow/executors/kubernetes_executor.py b/airflow/executors/kubernetes_executor.py
index e510da2b31..9d15f1ab2d 100644
--- a/airflow/executors/kubernetes_executor.py
+++ b/airflow/executors/kubernetes_executor.py
@@ -38,6 +38,7 @@ from urllib3.exceptions import ReadTimeoutError
 from airflow.exceptions import AirflowException, PodReconciliationError
 from airflow.executors.base_executor import NOT_STARTED_MESSAGE, BaseExecutor, CommandType
 from airflow.kubernetes import pod_generator
+from airflow.kubernetes.istio import Istio
 from airflow.kubernetes.kube_client import get_kube_client
 from airflow.kubernetes.kube_config import KubeConfig
 from airflow.kubernetes.kubernetes_helper_functions import annotations_to_key, create_pod_id
@@ -91,6 +92,7 @@ class KubernetesJobWatcher(multiprocessing.Process, LoggingMixin):
         self.watcher_queue = watcher_queue
         self.resource_version = resource_version
         self.kube_config = kube_config
+        self.istio = Istio(get_kube_client())
 
     def run(self) -> None:
         """Performs watching"""
@@ -170,6 +172,7 @@ class KubernetesJobWatcher(multiprocessing.Process, LoggingMixin):
                 event=event,
             )
             last_resource_version = task.metadata.resource_version
+            self.istio.handle_istio_proxy(task)
 
         return last_resource_version
 
@@ -711,6 +714,7 @@ class KubernetesExecutor(BaseExecutor):
             for pod in pod_list.items:
                 self.adopt_launched_task(kube_client, pod, pod_ids)
         self._adopt_completed_pods(kube_client)
+        self._handle_zombied_istio_pods(kube_client)
         tis_to_flush.extend(pod_ids.values())
         return tis_to_flush
 
@@ -770,6 +774,22 @@ class KubernetesExecutor(BaseExecutor):
             except ApiException as e:
                 self.log.info("Failed to adopt pod %s. Reason: %s", pod.metadata.name, e)
 
+    def _handle_zombied_istio_pods(self, kube_client: client.CoreV1Api) -> None:
+        """
+        Handle Zombied pods that are caused because istio container is still running,
+        while base container (where Airflow task is run) is completed.
+
+        :param kube_client: kubernetes client used both to list pods and to exec into the sidecar
+        """
+        kwargs = {
+            'field_selector': "status.phase=Running",
+            'label_selector': 'kubernetes_executor=True',
+        }
+        pod_list = kube_client.list_namespaced_pod(namespace=self.kube_config.kube_namespace, **kwargs)
+        istio = Istio(kube_client=kube_client)
+        for pod in pod_list.items:
+            istio.handle_istio_proxy(pod)
+
     def _flush_task_queue(self) -> None:
         if not self.task_queue:
             raise AirflowException(NOT_STARTED_MESSAGE)
diff --git a/airflow/kubernetes/istio.py b/airflow/kubernetes/istio.py
new file mode 100644
index 0000000000..ea0c6fd8b2
--- /dev/null
+++ b/airflow/kubernetes/istio.py
@@ -0,0 +1,170 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tenacity
+from kubernetes.client.rest import ApiException
+from kubernetes.stream import stream
+from packaging.version import parse as semantic_version
+
+from airflow import AirflowException
+from airflow.utils.log.logging_mixin import LoggingMixin
+
+
+class SidecarNames:
+    """Container names of the sidecars that this module knows how to detect and shut down."""
+
+    ISTIO_PROXY = 'istio-proxy'
+
+
+class Istio(LoggingMixin):
+    """Handle all Istio-related logic"""
+
+    def __init__(self, kube_client):
+        super().__init__()
+        self._client = kube_client
+
+    def handle_istio_proxy(self, pod) -> bool:
+        """Ask a still-running istio-proxy sidecar to exit once the task is done.
+
+        The sidecar is only shut down when all other containers have terminated.
+
+        :param pod: The pod which we are checking for the sidecar
+        :returns: True if a shutdown was attempted (even if the Kubernetes API
+            call failed and was only logged), False if nothing needed to be done
+        :rtype: bool
+        :raises AirflowException: if the istio-proxy image is older than 1.3.0
+        """
+        if not self._should_shutdown_istio_proxy(pod):
+            return False
+        self.log.info(
+            "Detected that a task finished and needs an istio-proxy sidecar to be cleaned up. "
+            "pod name: %s",
+            pod.metadata.name,
+        )
+        try:
+            self._shutdown_istio_proxy(pod)
+        except ApiException:
+            # Best effort: keep the traceback so a failed shutdown can be diagnosed.
+            self.log.debug("Error handling Istio container for pod: %s", pod.metadata.name, exc_info=True)
+        return True
+
+    def _should_shutdown_istio_proxy(self, pod):
+        """Look for an istio-proxy, and decide if it should be shutdown.
+
+        Args:
+            pod (V1Pod): The pod which we are checking for the sidecar
+
+        Returns:
+            (bool): True if we detect a running istio-proxy, and all other
+                    containers are finished running, otherwise False
+        """
+        if pod.status.phase != "Running":
+            return False
+        found_istio = False
+        # container_statuses is an optional field and may be None; treat that as "no containers".
+        for container_status in pod.status.container_statuses or []:
+            if container_status.name == SidecarNames.ISTIO_PROXY and container_status.state.running:
+                found_istio = True
+                continue
+            if not container_status.state.terminated:
+                # Any state besides 'terminated' should be considered still busy
+                return False
+        # If we didn't find an istio-proxy in state "running", then
+        # there is nothing for us to shut down.
+        return found_istio
+
+    def _shutdown_istio_proxy(self, pod):
+        """Shutdown the istio-proxy on the provided pod
+
+        Args:
+            pod (V1Pod): The pod which the container is in
+
+        Raises:
+            AirflowException: if the istio-proxy version is older than 1.3.0.
+            ApiException: if the shutdown request fails and the sidecar is still running.
+        """
+        for container in pod.spec.containers:
+            # Skip unless it's a sidecar named as SidecarNames.ISTIO_PROXY.
+            if container.name != SidecarNames.ISTIO_PROXY:
+                continue
+            # Check if supported version of istio-proxy. The tag is looked up in the
+            # last path segment only, so "registry:5000/istio/proxyv2" (a registry port,
+            # no tag) and "...@sha256:<digest>" references are not mistaken for a tag.
+            name_and_tag = (container.image or "").split("@", 1)[0].rsplit("/", 1)[-1]
+            _, _, tag = name_and_tag.partition(":")
+            try:
+                too_old = bool(tag) and semantic_version(tag) < semantic_version("1.3.0-rc.0")
+            except ValueError:
+                # InvalidVersion (a ValueError): we can't tell the version, so proceed anyways.
+                too_old = False
+            if too_old:
+                raise AirflowException(
+                    'Please use istio version 1.3.0+ for KubernetesExecutor compatibility.'
+                    + f' Detected version {tag}'
+                )
+
+            # Determine the istio-proxy statusPort, which is where
+            # /quitquitquit is implemented. Default to 15020.
+            status_port = "15020"
+            args = [arg.strip() for arg in container.args or []]
+            for position, arg in enumerate(args):
+                if arg == "--statusPort" and position + 1 < len(args):
+                    status_port = args[position + 1]
+                    break
+                if arg.startswith("--statusPort="):
+                    status_port = arg[len("--statusPort=") :]
+                    break
+
+            self.log.info("Shutting down istio-proxy in pod %s", pod.metadata.name)
+            self._post_quitquitquit(pod, container, status_port)
+
+    @tenacity.retry(
+        stop=tenacity.stop_after_attempt(3),
+        wait=tenacity.wait_fixed(0.5),
+        reraise=True,
+        retry=tenacity.retry_if_exception_type(ApiException),
+    )
+    def _post_quitquitquit(self, pod, container, status_port):
+        """Send the curl to shutdown the istio-proxy container.
+
+        Retried on ApiException (3 attempts) while the sidecar is still running.
+        """
+        # Use exec to curl localhost inside of the sidecar.
+        try:
+            stream(
+                self._client.connect_get_namespaced_pod_exec,
+                pod.metadata.name,
+                pod.metadata.namespace,
+                tty=False,
+                stderr=True,
+                stdin=False,
+                stdout=True,
+                container=container.name,
+                command=['/bin/sh', '-c', f'curl -XPOST http://127.0.0.1:{status_port}/quitquitquit'],
+            )
+        except ApiException:
+            # Check if the istio sidecar has already been shut down
+            current_pod = self._client.read_namespaced_pod(
+                name=pod.metadata.name,
+                namespace=pod.metadata.namespace,
+            )
+            if self._should_shutdown_istio_proxy(current_pod):
+                raise
+            self.log.info(
+                "Istio sidecar is already shut down in %s, so continuing on",
+                pod.metadata.name,
+            )
diff --git a/setup.py b/setup.py
index 4d5dbd1bb8..a5f270e3aa 100644
--- a/setup.py
+++ b/setup.py
@@ -434,6 +434,7 @@ kubernetes = [
     # potential breaking changes in Airflow Core as well (kubernetes is added as extra, so Airflow
     # core is not hard-limited via install-requirements, only by extra).
     'kubernetes>=21.7.0,<24',
+    'packaging>=19.1',
 ]
 kylin = ['kylinpy>=2.6']
 ldap = [
diff --git a/tests/kubernetes/test_istio.py b/tests/kubernetes/test_istio.py
new file mode 100644
index 0000000000..6840ffe86b
--- /dev/null
+++ b/tests/kubernetes/test_istio.py
@@ -0,0 +1,119 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import unittest
+from unittest.mock import MagicMock, patch
+
+from airflow import AirflowException
+from airflow.kubernetes.istio import Istio
+
+
+def mock_stream(func, *args, **kwargs):
+    """Stand-in for ``kubernetes.stream.stream`` that simply calls ``func`` with the given arguments."""
+    return func(*args, **kwargs)
+
+
+class TestIstio(unittest.TestCase):
+    """Unit tests for Istio.handle_istio_proxy using a mocked Kubernetes client."""
+
+    def setUp(self):
+        self.istio = Istio(MagicMock())
+
+    @staticmethod
+    def _mock_status(name, running):
+        """Build a container status that is either running or terminated."""
+        status = MagicMock()
+        status.name = name
+        status.state.running = running
+        status.state.terminated = not running
+        return status
+
+    def _mock_pod(self, image="istio/proxyv2:1.3.0", args=None, with_sidecar=True):
+        """Build a Running pod whose base container has finished, optionally with istio-proxy."""
+        sidecar = MagicMock()
+        sidecar.name = "istio-proxy"
+        sidecar.namespace = "fake-namespace"
+        sidecar.image = image
+        sidecar.args = args
+        pod = MagicMock()
+        pod.spec.containers = [sidecar] if with_sidecar else []
+        pod.status.phase = "Running"
+        pod.metadata.name = "fake-pod-name"
+        pod.metadata.namespace = "fake-namespace"
+        pod.status.container_statuses = [self._mock_status("base", running=False)]
+        if with_sidecar:
+            pod.status.container_statuses.insert(0, self._mock_status("istio-proxy", running=True))
+        return pod
+
+    def _assert_quit_posted_to(self, port):
+        """Assert that exactly one exec call curled /quitquitquit on ``port``."""
+        self.istio._client.connect_get_namespaced_pod_exec.assert_called_once_with(
+            'fake-pod-name',
+            'fake-namespace',
+            tty=False,
+            container='istio-proxy',
+            stderr=True,
+            stdin=False,
+            stdout=True,
+            command=['/bin/sh', '-c', f'curl -XPOST http://127.0.0.1:{port}/quitquitquit'],
+        )
+
+    def test_handle_istio_proxy_low_version(self):
+        pod = self._mock_pod(image="istio/proxyv2:1.2.9")
+        self.assertRaises(AirflowException, self.istio.handle_istio_proxy, pod)
+
+    @patch("airflow.kubernetes.istio.stream", new=mock_stream)
+    def test_handle_istio_proxy(self):
+        pod = self._mock_pod(args=["proxy", "sidecar", "--statusPort", "12345"])
+        self.istio.handle_istio_proxy(pod)
+        self._assert_quit_posted_to("12345")
+
+    @patch("airflow.kubernetes.istio.stream", new=mock_stream)
+    def test_handle_istio_proxy_other_cli_format(self):
+        pod = self._mock_pod(args=["proxy", "sidecar", "--statusPort=12345"])
+        self.istio.handle_istio_proxy(pod)
+        self._assert_quit_posted_to("12345")
+
+    @patch("airflow.kubernetes.istio.stream", new=mock_stream)
+    def test_handle_istio_proxy_no_cli_argument(self):
+        pod = self._mock_pod(args=["proxy", "sidecar"])
+        self.istio.handle_istio_proxy(pod)
+        self._assert_quit_posted_to("15020")
+
+    @patch("airflow.kubernetes.istio.stream", new=mock_stream)
+    def test_handle_istio_with_no_sidecar(self):
+        pod = self._mock_pod(with_sidecar=False)
+        self.assertFalse(self.istio.handle_istio_proxy(pod))
+        self.istio._client.connect_get_namespaced_pod_exec.assert_not_called()
+
+    @patch("airflow.kubernetes.istio.stream", new=mock_stream)
+    def test_handle_istio_proxy_task_still_running(self):
+        pod = self._mock_pod()
+        pod.status.container_statuses[1] = self._mock_status("base", running=True)
+        self.assertFalse(self.istio.handle_istio_proxy(pod))
+        self.istio._client.connect_get_namespaced_pod_exec.assert_not_called()
+
+    @patch("airflow.kubernetes.istio.stream", new=mock_stream)
+    def test_handle_istio_proxy_pod_not_running(self):
+        pod = self._mock_pod()
+        pod.status.phase = "Succeeded"
+        self.assertFalse(self.istio.handle_istio_proxy(pod))
+        self.istio._client.connect_get_namespaced_pod_exec.assert_not_called()
+
+
+if __name__ == "__main__":
+    unittest.main()