You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by po...@apache.org on 2021/01/02 00:43:00 UTC

[airflow] branch master updated: Change timeout s and disables reverse IP lookup for integrations (#13424)

This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/airflow.git


The following commit(s) were added to refs/heads/master by this push:
     new ae625b4  Change timeout s and disables reverse IP lookup for integrations (#13424)
ae625b4 is described below

commit ae625b44839c42ace4ffa5c83a9b057ab01918dc
Author: Jarek Potiuk <ja...@polidea.com>
AuthorDate: Sat Jan 2 01:42:39 2021 +0100

    Change timeout s and disables reverse IP lookup for integrations (#13424)
    
    It seems that we are increasingly hitting one of Ash's
    favourite bugs: DNS. Quote: "It's always DNS".
    
    It looks like there is a race condition with docker compose
    that causes services that started fast enough (before DNS)
    to get a different reverse-DNS IP lookup (usually it is
    just `<SERVICE>` but sometimes it is
    `<DOCKER_COMPOSE_APP>_<SERVICE>_1_<NETWORK>`).
    This produces misleading messages in the log that might
    make analysis of such problems difficult. That's why
    we chose to get rid of the reverse lookup and give
    each service more time to become ready.
    
    Netcat, unfortunately, performs both forward and reverse
    lookup when given a name - forward lookup to find the
    IP address and reverse lookup to write information to the
    log about the host it connected to - and if it sees
    that the original and reverse-looked-up names do not match,
    even if it manages to connect, it returns an error:
    
    `DNS fwd/rev mismatch` - which is very misleading.
    
    This change performs the following:
    
    1) We look up the host name in python via gethostbyname
    2) We set -n in netcat to disable ANY DNS use
    3) We feed netcat with the IP address
    4) We've standardized all waiting times to be up to 50 seconds
    
    This way we should get rid of the DNS fwd/rev mismatch once
    and for all.
---
 scripts/in_container/check_environment.sh    | 50 +++++++++++++++++++---------
 scripts/in_container/prod/entrypoint_prod.sh | 20 ++++++++++-
 2 files changed, 53 insertions(+), 17 deletions(-)

diff --git a/scripts/in_container/check_environment.sh b/scripts/in_container/check_environment.sh
index 82bbad5..f684746 100755
--- a/scripts/in_container/check_environment.sh
+++ b/scripts/in_container/check_environment.sh
@@ -21,6 +21,24 @@ EXIT_CODE=0
 
 DISABLED_INTEGRATIONS=""
 
+# We want to avoid misleading messages and perform only forward lookup of the service IP address.
+# Netcat when run without -n performs both forward and reverse lookup and fails if the reverse
+# lookup name does not match the original name even if the host is reachable via IP. This happens
+# randomly with docker-compose in Github Actions.
+# Since we are not using reverse lookup elsewhere, we can perform forward lookup in python
+# And use the IP in NC and add '-n' switch to disable any DNS use.
+# Even if this message might be harmless, it might hide the real reason for the problem
+# Which is the long time needed to start some services, seeing this message might be totally misleading
+# when you try to analyse the problem, that's why it's best to avoid it,
+function run_nc() {
+    local host=${1}
+    local port=${2}
+    local ip
+    ip=$(python -c "import socket; print(socket.gethostbyname('${host}'))")
+
+    nc -zvvn "${ip}" "${port}"
+}
+
 function check_service {
     LABEL=$1
     CALL=$2
@@ -77,9 +95,9 @@ function check_db_backend {
     MAX_CHECK=${1:=1}
 
     if [[ ${BACKEND} == "postgres" ]]; then
-        check_service "PostgreSQL" "nc -zvv postgres 5432" "${MAX_CHECK}"
+        check_service "PostgreSQL" "run_nc postgres 5432" "${MAX_CHECK}"
     elif [[ ${BACKEND} == "mysql" ]]; then
-        check_service "MySQL" "nc -zvv mysql 3306" "${MAX_CHECK}"
+        check_service "MySQL" "run_nc mysql 3306" "${MAX_CHECK}"
     elif [[ ${BACKEND} == "sqlite" ]]; then
         return
     else
@@ -134,26 +152,26 @@ echo "==========================================================================
 echo "             Checking integrations and backends"
 echo "==============================================================================================="
 if [[ -n ${BACKEND=} ]]; then
-    check_db_backend 20
+    check_db_backend 50
     echo "-----------------------------------------------------------------------------------------------"
 fi
-check_integration "Kerberos" "kerberos" "nc -zvv kdc-server-example-com 88" 30
-check_integration "MongoDB" "mongo" "nc -zvv mongo 27017" 20
-check_integration "Redis" "redis" "nc -zvv redis 6379" 20
-check_integration "RabbitMQ" "rabbitmq" "nc -zvv rabbitmq 5672" 20
-check_integration "Cassandra" "cassandra" "nc -zvv cassandra 9042" 20
-check_integration "OpenLDAP" "openldap" "nc -zvv openldap 389" 20
-check_integration "Presto (HTTP)" "presto" "nc -zvv presto 8080" 40
-check_integration "Presto (HTTPS)" "presto" "nc -zvv presto 7778" 40
+check_integration "Kerberos" "kerberos" "run_nc kdc-server-example-com 88" 50
+check_integration "MongoDB" "mongo" "run_nc mongo 27017" 50
+check_integration "Redis" "redis" "run_nc redis 6379" 50
+check_integration "Cassandra" "cassandra" "run_nc cassandra 9042" 50
+check_integration "OpenLDAP" "openldap" "run_nc openldap 389" 50
+check_integration "Presto (HTTP)" "presto" "run_nc presto 8080" 50
+check_integration "Presto (HTTPS)" "presto" "run_nc presto 7778" 50
 check_integration "Presto (API)" "presto" \
-    "curl --max-time 1 http://presto:8080/v1/info/ | grep '\"starting\":false'" 20
-check_integration "Pinot (HTTP)" "pinot" "nc -zvv pinot 9000" 40
+    "curl --max-time 1 http://presto:8080/v1/info/ | grep '\"starting\":false'" 50
+check_integration "Pinot (HTTP)" "pinot" "run_nc pinot 9000" 50
 CMD="curl --max-time 1 -X GET 'http://pinot:9000/health' -H 'accept: text/plain' | grep OK"
-check_integration "Presto (Controller API)" "pinot" "${CMD}" 20
+check_integration "Presto (Controller API)" "pinot" "${CMD}" 50
 CMD="curl --max-time 1 -X GET 'http://pinot:9000/pinot-controller/admin' -H 'accept: text/plain' | grep GOOD"
-check_integration "Presto (Controller API)" "pinot" "${CMD}" 20
+check_integration "Presto (Controller API)" "pinot" "${CMD}" 50
 CMD="curl --max-time 1 -X GET 'http://pinot:8000/health' -H 'accept: text/plain' | grep OK"
-check_integration "Presto (Broker API)" "pinot" "${CMD}" 20
+check_integration "Presto (Broker API)" "pinot" "${CMD}" 50
+check_integration "RabbitMQ" "rabbitmq" "run_nc rabbitmq 5672" 50
 
 echo "-----------------------------------------------------------------------------------------------"
 
diff --git a/scripts/in_container/prod/entrypoint_prod.sh b/scripts/in_container/prod/entrypoint_prod.sh
index d0fde62..357e833 100755
--- a/scripts/in_container/prod/entrypoint_prod.sh
+++ b/scripts/in_container/prod/entrypoint_prod.sh
@@ -21,6 +21,24 @@ AIRFLOW_COMMAND="${1}"
 
 set -euo pipefail
 
+# We want to avoid misleading messages and perform only forward lookup of the service IP address.
+# Netcat when run without -n performs both forward and reverse lookup and fails if the reverse
+# lookup name does not match the original name even if the host is reachable via IP. This happens
+# randomly with docker-compose in Github Actions.
+# Since we are not using reverse lookup elsewhere, we can perform forward lookup in python
+# And use the IP in NC and add '-n' switch to disable any DNS use.
+# Even if this message might be harmless, it might hide the real reason for the problem
+# Which is the long time needed to start some services, seeing this message might be totally misleading
+# when you try to analyse the problem, that's why it's best to avoid it,
+function run_nc() {
+    local host=${1}
+    local port=${2}
+    local ip
+    ip=$(python -c "import socket; print(socket.gethostbyname('${host}'))")
+
+    nc -zvvn "${ip}" "${port}"
+}
+
 function verify_db_connection {
     DB_URL="${1}"
 
@@ -61,7 +79,7 @@ function verify_db_connection {
         while true
         do
             set +e
-            LAST_CHECK_RESULT=$(nc -zvv "${DB_HOST}" "${DB_PORT}" >/dev/null 2>&1)
+            LAST_CHECK_RESULT=$(run_nc "${DB_HOST}" "${DB_PORT}" >/dev/null 2>&1)
             RES=$?
             set -e
             if [[ ${RES} == 0 ]]; then