You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@flink.apache.org by gy...@apache.org on 2022/11/22 21:20:57 UTC

[flink-kubernetes-operator] branch main updated: [FLINK-29475] Add error checker for the operator in e2e tests

This is an automated email from the ASF dual-hosted git repository.

gyfora pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/flink-kubernetes-operator.git


The following commit(s) were added to refs/heads/main by this push:
     new af00c99d [FLINK-29475] Add error checker for the operator in e2e tests
af00c99d is described below

commit af00c99defbe49c84dbd8a3ac4341136ca3efac9
Author: Gabor Somogyi <ga...@apple.com>
AuthorDate: Mon Nov 21 10:15:15 2022 +0100

    [FLINK-29475] Add error checker for the operator in e2e tests
---
 e2e-tests/test_application_kubernetes_ha.sh |  2 ++
 e2e-tests/test_application_operations.sh    |  2 ++
 e2e-tests/test_multi_sessionjob.sh          |  2 ++
 e2e-tests/test_sessionjob_kubernetes_ha.sh  |  2 ++
 e2e-tests/test_sessionjob_operations.sh     |  2 ++
 e2e-tests/utils.sh                          | 29 +++++++++++++++++++++++++++++
 6 files changed, 39 insertions(+)

diff --git a/e2e-tests/test_application_kubernetes_ha.sh b/e2e-tests/test_application_kubernetes_ha.sh
index 1797b29a..eda15bc6 100755
--- a/e2e-tests/test_application_kubernetes_ha.sh
+++ b/e2e-tests/test_application_kubernetes_ha.sh
@@ -47,5 +47,7 @@ wait_for_logs $jm_pod_name "Completed checkpoint [0-9]+ for job" ${TIMEOUT} || e
 wait_for_status flinkdep/flink-example-statemachine '.status.jobManagerDeploymentStatus' READY ${TIMEOUT} || exit 1
 wait_for_status flinkdep/flink-example-statemachine '.status.jobStatus.state' RUNNING ${TIMEOUT} || exit 1
 
+check_operator_log_for_errors || exit 1
+
 echo "Successfully run the Flink Kubernetes application HA test"
 
diff --git a/e2e-tests/test_application_operations.sh b/e2e-tests/test_application_operations.sh
index 457972c8..f6d1ace3 100755
--- a/e2e-tests/test_application_operations.sh
+++ b/e2e-tests/test_application_operations.sh
@@ -67,4 +67,6 @@ wait_for_status flinkdep/flink-example-statemachine '.status.jobManagerDeploymen
 wait_for_status flinkdep/flink-example-statemachine '.status.jobStatus.state' RUNNING ${TIMEOUT} || exit 1
 assert_available_slots 1 $CLUSTER_ID
 
+check_operator_log_for_errors || exit 1
+
 echo "Successfully run the last-state upgrade test"
diff --git a/e2e-tests/test_multi_sessionjob.sh b/e2e-tests/test_multi_sessionjob.sh
index 59990870..09862db5 100755
--- a/e2e-tests/test_multi_sessionjob.sh
+++ b/e2e-tests/test_multi_sessionjob.sh
@@ -38,6 +38,7 @@ jm_pod_name=$(get_jm_pod_name $CLUSTER_ID)
 wait_for_logs $jm_pod_name "Completed checkpoint [0-9]+ for job" ${TIMEOUT} || exit 1
 wait_for_status $SESSION_CLUSTER_IDENTIFIER '.status.jobManagerDeploymentStatus' READY ${TIMEOUT} || exit 1
 wait_for_status $SESSION_JOB_IDENTIFIER '.status.jobStatus.state' RUNNING ${TIMEOUT} || exit 1
+check_operator_log_for_errors || exit 1
 echo "Flink Session Job is running properly"
 
 # Current namespace: flink
@@ -48,4 +49,5 @@ jm_pod_name=$(get_jm_pod_name $CLUSTER_ID)
 wait_for_logs $jm_pod_name "Completed checkpoint [0-9]+ for job" ${TIMEOUT} || exit 1
 wait_for_status $SESSION_CLUSTER_IDENTIFIER '.status.jobManagerDeploymentStatus' READY ${TIMEOUT} || exit 1
 wait_for_status $SESSION_JOB_IDENTIFIER '.status.jobStatus.state' RUNNING ${TIMEOUT} || exit 1
+check_operator_log_for_errors || exit 1
 echo "Flink Session Job is running properly"
diff --git a/e2e-tests/test_sessionjob_kubernetes_ha.sh b/e2e-tests/test_sessionjob_kubernetes_ha.sh
index 0ad55b12..7a0fa813 100755
--- a/e2e-tests/test_sessionjob_kubernetes_ha.sh
+++ b/e2e-tests/test_sessionjob_kubernetes_ha.sh
@@ -48,5 +48,7 @@ wait_for_logs $jm_pod_name "Completed checkpoint [0-9]+ for job" ${TIMEOUT} || e
 wait_for_status $SESSION_CLUSTER_IDENTIFIER '.status.jobManagerDeploymentStatus' READY ${TIMEOUT} || exit 1
 wait_for_status $SESSION_JOB_IDENTIFIER '.status.jobStatus.state' RUNNING ${TIMEOUT} || exit 1
 
+check_operator_log_for_errors || exit 1
+
 echo "Successfully run the Flink Session Job HA test"
 
diff --git a/e2e-tests/test_sessionjob_operations.sh b/e2e-tests/test_sessionjob_operations.sh
index b1c88fc2..c230af8c 100755
--- a/e2e-tests/test_sessionjob_operations.sh
+++ b/e2e-tests/test_sessionjob_operations.sh
@@ -79,3 +79,5 @@ wait_for_jobmanager_running $CLUSTER_ID $TIMEOUT
 wait_for_logs $jm_pod_name "Completed checkpoint [0-9]+ for job" ${TIMEOUT} || exit 1
 wait_for_status $SESSION_CLUSTER_IDENTIFIER '.status.jobManagerDeploymentStatus' READY ${TIMEOUT} || exit 1
 wait_for_status $SESSION_JOB_IDENTIFIER '.status.jobStatus.state' RUNNING ${TIMEOUT} || exit 1
+
+check_operator_log_for_errors || exit 1
diff --git a/e2e-tests/utils.sh b/e2e-tests/utils.sh
index b8df6a42..447f038c 100755
--- a/e2e-tests/utils.sh
+++ b/e2e-tests/utils.sh
@@ -83,6 +83,11 @@ function wait_for_jobmanager_running() {
     wait_for_logs $jm_pod_name "Rest endpoint listening at" ${TIMEOUT} || exit 1
 }
 
+function get_operator_pod_name() {  # Prints the metadata.name that kubectl reports for pods labelled app.kubernetes.io/name=flink-kubernetes-operator.
+   operator_pod_name=$(kubectl get pods --selector="app.kubernetes.io/name=flink-kubernetes-operator" -o jsonpath='{..metadata.name}')  # NOTE(review): with several matching pods this presumably yields all names in one string - confirm
+   echo "${operator_pod_name}"  # stdout is the result; operator_pod_name is not declared local, so a direct (non-$(...)) call also sets it globally
+}
+
 function get_jm_pod_name() {
    CLUSTER_ID=$1
    jm_pod_name=$(kubectl get pods --selector="app=${CLUSTER_ID},component=jobmanager" -o jsonpath='{..metadata.name}')
@@ -108,12 +113,36 @@ function retry_times() {
     return 1
 }
 
+function check_operator_log_for_errors {  # Returns 1 if the operator log cannot be read or contains a non-allowlisted "error" line (case-insensitive); otherwise 0.
+  echo "Checking for operator log errors..."
+  local operator_pod_name operator_log errors  # keep working variables out of the caller's scope
+  operator_pod_name=$(get_operator_pod_name)
+  operator_log=$(kubectl logs "${operator_pod_name}") || { echo "Failed to fetch operator logs."; return 1; }  # an unreadable log must not be reported as a clean log
+  errors=$(printf '%s\n' "${operator_log}" \
+      | grep -v "Exception while listing jobs" `#https://issues.apache.org/jira/browse/FLINK-30146` \
+      | grep -v "Failed to submit a listener notification task" `#https://issues.apache.org/jira/browse/FLINK-30147` \
+      | grep -v "Failed to submit job to session cluster" `#https://issues.apache.org/jira/browse/FLINK-30148` \
+      | grep -v "Error during event processing" `#https://issues.apache.org/jira/browse/FLINK-30149` \
+      | grep -v "REST service in session cluster is bad now" `#https://issues.apache.org/jira/browse/FLINK-30150` \
+      | grep -v "AuditUtils" `#https://issues.apache.org/jira/browse/FLINK-30151` \
+      | grep -i "error" || true)  # grep exits 1 when nothing matches, which is the success case here
+  if [ -n "${errors}" ]; then
+    printf 'Found error in log files.\n\n%s\n' "${errors}"  # printf keeps backslashes in the logged lines literal, unlike echo -e
+    return 1
+  fi
+  echo "No errors in log files."  # last command, so its zero status is the function's return value
+}
+
 function debug_and_show_logs {
     echo "Debugging failed e2e test:"
     echo "Currently existing Kubernetes resources"
     kubectl get all
     kubectl describe all
 
+    echo "Operator logs:"
+    operator_pod_name=$(get_operator_pod_name)
+    kubectl logs "${operator_pod_name}"
+
     echo "Flink logs:"
     kubectl get pods -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | while read pod;do
         containers=(`kubectl get pods  $pod -o jsonpath='{.spec.containers[*].name}'`)