You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@flink.apache.org by GitBox <gi...@apache.org> on 2018/11/23 10:34:23 UTC
[GitHub] asfgit closed pull request #7073: [FLINK-10842][E2E tests] fix broken waiting loops in common.sh
asfgit closed pull request #7073: [FLINK-10842][E2E tests] fix broken waiting loops in common.sh
URL: https://github.com/apache/flink/pull/7073
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (GitHub would otherwise not show it once the pull request is merged):
diff --git a/flink-end-to-end-tests/test-scripts/common.sh b/flink-end-to-end-tests/test-scripts/common.sh
index 275d9c49f4b..645cf2a2f1e 100644
--- a/flink-end-to-end-tests/test-scripts/common.sh
+++ b/flink-end-to-end-tests/test-scripts/common.sh
@@ -212,19 +212,22 @@ function start_local_zk {
function wait_dispatcher_running {
# wait at most 10 seconds until the dispatcher is up
local QUERY_URL="${REST_PROTOCOL}://${NODENAME}:8081/taskmanagers"
- for i in {1..10}; do
+ local TIMEOUT=10
+ for i in $(seq 1 ${TIMEOUT}); do
# without the || true this would exit our script if the JobManager is not yet up
QUERY_RESULT=$(curl ${CURL_SSL_ARGS} "$QUERY_URL" 2> /dev/null || true)
# ensure the taskmanagers field is there at all and is not empty
if [[ ${QUERY_RESULT} =~ \{\"taskmanagers\":\[.+\]\} ]]; then
echo "Dispatcher REST endpoint is up."
- break
+ return
fi
echo "Waiting for dispatcher REST endpoint to come up..."
sleep 1
done
+ echo "Dispatcher REST endpoint has not started within a timeout of ${TIMEOUT} sec"
+ exit 1
}
function start_cluster {
@@ -242,30 +245,45 @@ function start_taskmanagers {
}
function start_and_wait_for_tm {
- local url="${REST_PROTOCOL}://${NODENAME}:8081/taskmanagers"
-
- tm_query_result=$(curl ${CURL_SSL_ARGS} -s "${url}")
-
+ tm_query_result=`query_running_tms`
# we assume that the cluster is running
if ! [[ ${tm_query_result} =~ \{\"taskmanagers\":\[.*\]\} ]]; then
echo "Your cluster seems to be unresponsive at the moment: ${tm_query_result}" 1>&2
exit 1
fi
- running_tms=`curl ${CURL_SSL_ARGS} -s "${url}" | grep -o "id" | wc -l`
-
+ running_tms=`query_number_of_running_tms`
${FLINK_DIR}/bin/taskmanager.sh start
+ wait_for_number_of_running_tms $((running_tms+1))
+}
- for i in {1..10}; do
- local new_running_tms=`curl ${CURL_SSL_ARGS} -s "${url}" | grep -o "id" | wc -l`
- if [ $((new_running_tms-running_tms)) -eq 0 ]; then
- echo "TaskManager is not yet up."
+function query_running_tms {
+ local url="${REST_PROTOCOL}://${NODENAME}:8081/taskmanagers"
+ curl ${CURL_SSL_ARGS} -s "${url}"
+}
+
+function query_number_of_running_tms {
+ query_running_tms | grep -o "id" | wc -l
+}
+
+function wait_for_number_of_running_tms {
+ local TM_NUM_TO_WAIT=${1}
+ local TIMEOUT_COUNTER=10
+ local TIMEOUT_INC=4
+ local TIMEOUT=$(( $TIMEOUT_COUNTER * $TIMEOUT_INC ))
+ local TM_NUM_TEXT="Number of running task managers"
+ for i in $(seq 1 ${TIMEOUT_COUNTER}); do
+ local TM_NUM=`query_number_of_running_tms`
+ if [ $((TM_NUM - TM_NUM_TO_WAIT)) -eq 0 ]; then
+ echo "${TM_NUM_TEXT} has reached ${TM_NUM_TO_WAIT}."
+ return
else
- echo "TaskManager is up."
- break
+ echo "${TM_NUM_TEXT} ${TM_NUM} is not yet ${TM_NUM_TO_WAIT}."
fi
- sleep 4
+ sleep ${TIMEOUT_INC}
done
+ echo "${TM_NUM_TEXT} has not reached ${TM_NUM_TO_WAIT} within a timeout of ${TIMEOUT} sec"
+ exit 1
}
function check_logs_for_errors {
@@ -376,17 +394,20 @@ function wait_for_job_state_transition {
}
function wait_job_running {
- for i in {1..10}; do
+ local TIMEOUT=10
+ for i in $(seq 1 ${TIMEOUT}); do
JOB_LIST_RESULT=$("$FLINK_DIR"/bin/flink list -r | grep "$1")
if [[ "$JOB_LIST_RESULT" == "" ]]; then
echo "Job ($1) is not yet running."
else
echo "Job ($1) is running."
- break
+ return
fi
sleep 1
done
+ echo "Job ($1) has not started within a timeout of ${TIMEOUT} sec"
+ exit 1
}
function wait_job_terminal_state {
diff --git a/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh b/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh
index 56d811e14a1..db0174a2160 100755
--- a/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh
+++ b/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh
@@ -86,6 +86,7 @@ function run_test() {
fi
kill_random_taskmanager
+ wait_for_number_of_running_tms 0
latest_snapshot_count=$(cat $FLINK_DIR/log/*out* | grep "on snapshot" | tail -n 1 | awk '{print $4}')
echo "Latest snapshot count was ${latest_snapshot_count}"
diff --git a/flink-end-to-end-tests/test-scripts/test_resume_externalized_checkpoints.sh b/flink-end-to-end-tests/test-scripts/test_resume_externalized_checkpoints.sh
index 35fe30b6b25..48d68c98e9d 100755
--- a/flink-end-to-end-tests/test-scripts/test_resume_externalized_checkpoints.sh
+++ b/flink-end-to-end-tests/test-scripts/test_resume_externalized_checkpoints.sh
@@ -100,11 +100,10 @@ fi
DATASTREAM_JOB=$($JOB_CMD | grep "Job has been submitted with JobID" | sed 's/.* //g')
-wait_job_running $DATASTREAM_JOB
-
if [[ $SIMULATE_FAILURE == "true" ]]; then
wait_job_terminal_state $DATASTREAM_JOB FAILED
else
+ wait_job_running $DATASTREAM_JOB
wait_num_checkpoints $DATASTREAM_JOB 1
wait_oper_metric_num_in_records SemanticsCheckMapper.0 200
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
With regards,
Apache Git Services