You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2019/11/20 21:59:56 UTC
[impala] 02/02: IMPALA-9165: Add timeout for create-load-data.sh
This is an automated email from the ASF dual-hosted git repository.
joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
View the commit online:
https://github.com/apache/impala/commit/fc4a91cf8c87966a910106dded7e7eb8d215270a
commit fc4a91cf8c87966a910106dded7e7eb8d215270a
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Mon Nov 18 18:01:14 2019 -0800
IMPALA-9165: Add timeout for create-load-data.sh
This converts the existing bin/run-all-tests-timeout-check.sh
to a more generic bin/script-timeout-check.sh. It uses this
new script for both bin/run-all-tests.sh and
testdata/bin/create-load-data.sh. The new script takes two
arguments:
-timeout : timeout in minutes
-script_name : name of the calling script
The script_name is used in debugging output / output filenames
to make it clear what timed out.
The run-all-tests.sh timeout remains the same.
testdata/bin/create-load-data.sh uses a 2.5 hour timeout.
This should help debug the issue in IMPALA-9165, because at
least the logs would be preserved on the Jenkins job.
Testing:
- Tested the timeout script by hand with a caller script that
sleeps longer than the timeout
- Ran a gerrit-verify-dryrun-external
Change-Id: I19d76bd8850c7d4b5affff4d21f32d8715a382c6
Reviewed-on: http://gerrit.cloudera.org:8080/14741
Reviewed-by: Joe McDonnell <jo...@cloudera.com>
Tested-by: Joe McDonnell <jo...@cloudera.com>
---
bin/run-all-tests-timeout-check.sh | 70 -----------------------
bin/run-all-tests.sh | 3 +-
bin/script-timeout-check.sh | 111 +++++++++++++++++++++++++++++++++++++
testdata/bin/create-load-data.sh | 16 ++++++
4 files changed, 129 insertions(+), 71 deletions(-)
diff --git a/bin/run-all-tests-timeout-check.sh b/bin/run-all-tests-timeout-check.sh
deleted file mode 100755
index dda9462..0000000
--- a/bin/run-all-tests-timeout-check.sh
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/usr/bin/env bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# Script run by run-all-tests.sh that checks every 60 sec if the timeout expired and on
-# timeout prints the stacktraces of all impalads and then finally kills running tests.
-# Takes the timeout in minutes as an argument.
-
-SLEEP_TIMEOUT_S=0
-if [ -z "$1" ]; then
- echo "Expected timeout value as an argument"
- exit 1
-else
- SLEEP_TIMEOUT_S=$(($1 * 60))
-fi
-
-[[ $SLEEP_TIMEOUT_S < 1 ]] && exit
-
-echo
-echo
-echo "**** Timout Timer Started (pid $$, ppid $PPID) for $SLEEP_TIMEOUT_S s! ****"
-echo
-echo
-
-# Check timer every 60 seconds and only proceed if the parent process is still alive.
-# Note: $SECONDS is a bash built-in that counts seconds since bash started.
-while ((SLEEP_TIMEOUT_S - SECONDS > 0)); do
- sleep 1
- if ! ps $PPID &> /dev/null; then
- echo "Timeout Timer Exited because $PPID is gone."
- exit
- fi
-done
-
-echo
-echo
-echo '**** Tests TIMED OUT! ****'
-echo
-echo
-
-# Impala probably has a thread stuck. Print the stacktrace to the console output.
-mkdir -p "$IMPALA_TIMEOUT_LOGS_DIR"
-for pid in $(pgrep impalad); do
- echo "**** Generating stacktrace of impalad with process id: $pid ****"
- gdb -ex "thread apply all bt" --batch -p $pid > "${IMPALA_TIMEOUT_LOGS_DIR}/${pid}.txt"
-done
-
-# Now kill any running tests.
-kill $PPID
-
-"${IMPALA_HOME}"/bin/generate_junitxml.py --step "test_run" --error "Test run timed out.
-This probably happened due to a hung thread which can be confirmed by looking at the
-stacktrace of running impalad processes at ${IMPALA_TIMEOUT_LOGS_DIR}"
-
-
diff --git a/bin/run-all-tests.sh b/bin/run-all-tests.sh
index bc849f0..6d4919b 100755
--- a/bin/run-all-tests.sh
+++ b/bin/run-all-tests.sh
@@ -121,7 +121,8 @@ do
esac
done
-"${IMPALA_HOME}/bin/run-all-tests-timeout-check.sh" $TIMEOUT_FOR_RUN_ALL_TESTS_MINS &
+"${IMPALA_HOME}/bin/script-timeout-check.sh" -timeout $TIMEOUT_FOR_RUN_ALL_TESTS_MINS \
+ -script_name "$(basename $0)" &
TIMEOUT_PID=$!
# IMPALA-3947: "Exhaustive" tests are actually based on workload. This
diff --git a/bin/script-timeout-check.sh b/bin/script-timeout-check.sh
new file mode 100755
index 0000000..ad5fd04
--- /dev/null
+++ b/bin/script-timeout-check.sh
@@ -0,0 +1,111 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Helper script that checks every second if the timeout expired and on timeout saves
+# the stacktraces of all impalads (if running) and then finally kills the caller script.
+# Takes two required arguments:
+# -script_name : name of the caller script (only for debug messages / output filenames)
+# -timeout : the timeout in minutes
+#
+# The way to use it is:
+# ${IMPALA_HOME}/bin/script-timeout-check.sh -timeout {TIMEOUT} -script_name {NAME} &
+# TIMEOUT_PID=$!
+# ... body of script ...
+# # Kill the spawned timeout process and its child sleep process. There may not be
+# # a sleep process, so ignore failure.
+# pkill -P $TIMEOUT_PID || true
+# kill $TIMEOUT_PID
+
+SCRIPT_NAME=""
+SLEEP_TIMEOUT_MIN=""
+
+# Parse commandline options
+while [ -n "$*" ]
+do
+ case $1 in
+ -timeout)
+ SLEEP_TIMEOUT_MIN=${2-}
+ shift;
+ ;;
+ -script_name)
+ SCRIPT_NAME=${2-}
+ shift;
+ ;;
+ -help|-h|*)
+ echo "script-timeout-check.sh : aborts caller script if timeout expires"
+ echo "[-timeout] : The timeout in minutes (required)"
+ echo "[-script_name] : The name of the caller script (required)"
+ exit 1;
+ ;;
+ esac
+ shift;
+done
+
+if [ -z "$SLEEP_TIMEOUT_MIN" ]; then
+ echo "Must pass a -timeout flag with a valid timeout as an argument"
+ exit 1
+fi;
+
+if [ -z "$SCRIPT_NAME" ]; then
+ echo "Must pass a -script_name flag with an appropriate argument"
+ exit 1
+fi;
+
+SLEEP_TIMEOUT_S=$((${SLEEP_TIMEOUT_MIN} * 60))
+
+[[ $SLEEP_TIMEOUT_S < 1 ]] && exit
+
+echo
+echo
+echo "**** Timeout Timer Started (pid $$, ppid $PPID) for $SLEEP_TIMEOUT_S s! ****"
+echo
+echo
+
+# Check the timer every second and only proceed if the parent process is still alive.
+# Note: $SECONDS is a bash built-in that counts seconds since bash started.
+while ((SLEEP_TIMEOUT_S - SECONDS > 0)); do
+ sleep 1
+ if ! ps $PPID &> /dev/null; then
+ echo "Timeout Timer Exited because $SCRIPT_NAME PID $PPID is gone."
+ exit
+ fi
+done
+
+echo
+echo
+echo "**** ${SCRIPT_NAME} TIMED OUT! ****"
+echo
+echo
+
+# Impala might have a thread stuck. Write each impalad's stacktrace to a log file.
+mkdir -p "$IMPALA_TIMEOUT_LOGS_DIR"
+for pid in $(pgrep impalad); do
+ echo "**** Generating stacktrace of impalad with process id: $pid ****"
+ gdb -ex "thread apply all bt" --batch -p $pid > "${IMPALA_TIMEOUT_LOGS_DIR}/${pid}.txt"
+done
+
+# Now kill the caller
+kill $PPID
+
+"${IMPALA_HOME}"/bin/generate_junitxml.py --step "${SCRIPT_NAME}" \
+--error "Script ${SCRIPT_NAME} timed out. This probably happened due to a hung
+thread which can be confirmed by looking at the stacktrace of running impalad
+processes at ${IMPALA_TIMEOUT_LOGS_DIR}"
+
+
diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index 6798567..6488851 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -43,6 +43,8 @@ setup_report_build_error
: ${REMOTE_LOAD=}
: ${CM_HOST=}
: ${IMPALA_SERIAL_DATALOAD=}
+# We don't expect dataload to take more than 2.5 hours.
+: ${TIMEOUT_FOR_CREATE_LOAD_DATA_MINS:= 150}
SKIP_METADATA_LOAD=0
SKIP_SNAPSHOT_LOAD=0
@@ -90,6 +92,10 @@ do
-skip_ranger)
SKIP_RANGER=1
;;
+ -timeout)
+ TIMEOUT_FOR_CREATE_LOAD_DATA_MINS=${2-}
+ shift;
+ ;;
-help|-h|*)
echo "create-load-data.sh : Creates data and loads from scratch"
echo "[-skip_metadata_load] : Skips loading of metadata"
@@ -97,6 +103,7 @@ do
echo "[-snapshot_file] : Loads the test warehouse snapshot into hdfs"
echo "[-cm_host] : Address of the Cloudera Manager host if loading to a remote cluster"
echo "[-skip_ranger] : Skip the set-up for Ranger."
+ echo "[-timeout] : The timeout in minutes for loading data."
exit 1;
;;
esac
@@ -107,6 +114,10 @@ if [[ -n $REMOTE_LOAD ]]; then
SKIP_RANGER=1
fi
+"${IMPALA_HOME}/bin/script-timeout-check.sh" -timeout $TIMEOUT_FOR_CREATE_LOAD_DATA_MINS \
+ -script_name "$(basename $0)" &
+TIMEOUT_PID=$!
+
if [[ $SKIP_METADATA_LOAD -eq 0 && "$SNAPSHOT_FILE" = "" ]]; then
run-step "Generating HBase data" create-hbase.log \
${IMPALA_HOME}/testdata/bin/create-hbase.sh
@@ -704,3 +715,8 @@ fi
# restarting the minicluster works and doesn't impact the tests. This is a common
# operation for developers, so it is nice to test it.
restart-cluster
+
+# Kill the spawned timeout process and its child sleep process.
+# There may not be a sleep process, so ignore failure.
+pkill -P $TIMEOUT_PID || true
+kill $TIMEOUT_PID