You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2019/11/20 21:59:56 UTC

[impala] 02/02: IMPALA-9165: Add timeout for create-load-data.sh

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

View the commit online:
https://github.com/apache/impala/commit/fc4a91cf8c87966a910106dded7e7eb8d215270a

commit fc4a91cf8c87966a910106dded7e7eb8d215270a
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Mon Nov 18 18:01:14 2019 -0800

    IMPALA-9165: Add timeout for create-load-data.sh
    
    This converts the existing bin/run-all-tests-timeout-check.sh
    to a more generic bin/script-timeout-check.sh. It uses this
    new script for both bin/run-all-tests.sh and
    testdata/bin/create-load-data.sh. The new script takes two
    arguments:
     -timeout : timeout in minutes
     -script_name : name of the calling script
    The script_name is used in debugging output / output filenames
    to make it clear what timed out.
    
    The run-all-tests.sh timeout remains the same.
    testdata/bin/create-load-data.sh uses a 2.5 hour timeout.
    This should help debug the issue in IMPALA-9165, because at
    least the logs would be preserved on the Jenkins job.
    
    Testing:
     - Tested the timeout script by hand with a caller script that
       sleeps longer than the timeout
     - Ran a gerrit-verify-dryrun-external
    
    Change-Id: I19d76bd8850c7d4b5affff4d21f32d8715a382c6
    Reviewed-on: http://gerrit.cloudera.org:8080/14741
    Reviewed-by: Joe McDonnell <jo...@cloudera.com>
    Tested-by: Joe McDonnell <jo...@cloudera.com>
---
 bin/run-all-tests-timeout-check.sh |  70 -----------------------
 bin/run-all-tests.sh               |   3 +-
 bin/script-timeout-check.sh        | 111 +++++++++++++++++++++++++++++++++++++
 testdata/bin/create-load-data.sh   |  16 ++++++
 4 files changed, 129 insertions(+), 71 deletions(-)

diff --git a/bin/run-all-tests-timeout-check.sh b/bin/run-all-tests-timeout-check.sh
deleted file mode 100755
index dda9462..0000000
--- a/bin/run-all-tests-timeout-check.sh
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/usr/bin/env bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# Script run by run-all-tests.sh that checks every 60 sec if the timeout expired and on
-# timeout prints the stacktraces of all impalads and then finally kills running tests.
-# Takes the timeout in minutes as an argument.
-
-SLEEP_TIMEOUT_S=0
-if [ -z "$1" ]; then
-  echo "Expected timeout value as an argument"
-  exit 1
-else
-  SLEEP_TIMEOUT_S=$(($1 * 60))
-fi
-
-[[ $SLEEP_TIMEOUT_S < 1 ]] && exit
-
-echo
-echo
-echo "**** Timout Timer Started (pid $$, ppid $PPID) for $SLEEP_TIMEOUT_S s! ****"
-echo
-echo
-
-# Check timer every 60 seconds and only proceed if the parent process is still alive.
-# Note: $SECONDS is a bash built-in that counts seconds since bash started.
-while ((SLEEP_TIMEOUT_S - SECONDS > 0)); do
-  sleep 1
-  if ! ps $PPID &> /dev/null; then
-    echo "Timeout Timer Exited because $PPID is gone."
-    exit
-  fi
-done
-
-echo
-echo
-echo '**** Tests TIMED OUT! ****'
-echo
-echo
-
-# Impala probably has a thread stuck. Print the stacktrace to the console output.
-mkdir -p "$IMPALA_TIMEOUT_LOGS_DIR"
-for pid in $(pgrep impalad); do
-  echo "**** Generating stacktrace of impalad with process id: $pid ****"
-  gdb -ex "thread apply all bt"  --batch -p $pid > "${IMPALA_TIMEOUT_LOGS_DIR}/${pid}.txt"
-done
-
-# Now kill any running tests.
-kill $PPID
-
-"${IMPALA_HOME}"/bin/generate_junitxml.py --step "test_run" --error "Test run timed out.
-This probably happened due to a hung thread which can be confirmed by looking at the
-stacktrace of running impalad processes at ${IMPALA_TIMEOUT_LOGS_DIR}"
-
-
diff --git a/bin/run-all-tests.sh b/bin/run-all-tests.sh
index bc849f0..6d4919b 100755
--- a/bin/run-all-tests.sh
+++ b/bin/run-all-tests.sh
@@ -121,7 +121,8 @@ do
   esac
 done
 
-"${IMPALA_HOME}/bin/run-all-tests-timeout-check.sh" $TIMEOUT_FOR_RUN_ALL_TESTS_MINS &
+"${IMPALA_HOME}/bin/script-timeout-check.sh" -timeout $TIMEOUT_FOR_RUN_ALL_TESTS_MINS \
+    -script_name "$(basename $0)" &
 TIMEOUT_PID=$!
 
 # IMPALA-3947: "Exhaustive" tests are actually based on workload. This
diff --git a/bin/script-timeout-check.sh b/bin/script-timeout-check.sh
new file mode 100755
index 0000000..ad5fd04
--- /dev/null
+++ b/bin/script-timeout-check.sh
@@ -0,0 +1,111 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Helper script that checks every second if the timeout expired and on timeout prints
+# the stacktraces of all impalads (if running) and then finally kills the caller script.
+# Takes two required arguments:
+# -script_name : name of the caller script (only for debug messages / output filenames)
+# -timeout : the timeout in minutes
+#
+# The way to use it is:
+# ${IMPALA_HOME}/bin/script-timeout-check.sh -timeout {TIMEOUT} -script_name {NAME} &
+# TIMEOUT_PID=$!
+# ... body of script ...
+# # Kill the spawned timeout process and its child sleep process. There may not be
+# # a sleep process, so ignore failure.
+# pkill -P $TIMEOUT_PID || true
+# kill $TIMEOUT_PID
+
+SCRIPT_NAME=""
+SLEEP_TIMEOUT_MIN=""
+
+# Parse commandline options
+while [ -n "$*" ]
+do
+  case $1 in
+    -timeout)
+        SLEEP_TIMEOUT_MIN=${2-}
+        shift;
+        ;;
+    -script_name)
+        SCRIPT_NAME=${2-}
+        shift;
+        ;;
+    -help|-h|*)
+        echo "script-timeout-check.sh : aborts caller script if timeout expires"
+        echo "[-timeout] : The timeout in minutes (required)"
+        echo "[-script_name] : The name of the caller script (required)"
+        exit 1;
+        ;;
+  esac
+  shift;
+done
+
+if [ -z "$SLEEP_TIMEOUT_MIN" ]; then
+  echo "Must pass a -timeout flag with a valid timeout as an argument"
+  exit 1
+fi;
+
+if [ -z "$SCRIPT_NAME" ]; then
+  echo "Must pass a -script_name flag with an appropriate argument"
+  exit 1
+fi;
+
+SLEEP_TIMEOUT_S=$((${SLEEP_TIMEOUT_MIN} * 60))
+
+[[ $SLEEP_TIMEOUT_S < 1 ]] && exit
+
+echo
+echo
+echo "**** Timeout Timer Started (pid $$, ppid $PPID) for $SLEEP_TIMEOUT_S s! ****"
+echo
+echo
+
+# Check timer every second and only proceed if the parent process is still alive.
+# Note: $SECONDS is a bash built-in that counts seconds since bash started.
+while ((SLEEP_TIMEOUT_S - SECONDS > 0)); do
+  sleep 1
+  if ! ps $PPID &> /dev/null; then
+    echo "Timeout Timer Exited because $SCRIPT_NAME PID $PPID is gone."
+    exit
+  fi
+done
+
+echo
+echo
+echo "**** ${SCRIPT_NAME} TIMED OUT! ****"
+echo
+echo
+
+# Impala might have a thread stuck. Print the stacktrace to the console output.
+mkdir -p "$IMPALA_TIMEOUT_LOGS_DIR"
+for pid in $(pgrep impalad); do
+  echo "**** Generating stacktrace of impalad with process id: $pid ****"
+  gdb -ex "thread apply all bt"  --batch -p $pid > "${IMPALA_TIMEOUT_LOGS_DIR}/${pid}.txt"
+done
+
+# Now kill the caller
+kill $PPID
+
+"${IMPALA_HOME}"/bin/generate_junitxml.py --step "${SCRIPT_NAME}" \
+--error "Script ${SCRIPT_NAME} timed out. This probably happened due to a hung
+thread which can be confirmed by looking at the stacktrace of running impalad
+processes at ${IMPALA_TIMEOUT_LOGS_DIR}"
+
+
diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index 6798567..6488851 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -43,6 +43,8 @@ setup_report_build_error
 : ${REMOTE_LOAD=}
 : ${CM_HOST=}
 : ${IMPALA_SERIAL_DATALOAD=}
+# We don't expect dataload to take more than 2.5 hours.
+: ${TIMEOUT_FOR_CREATE_LOAD_DATA_MINS:= 150}
 
 SKIP_METADATA_LOAD=0
 SKIP_SNAPSHOT_LOAD=0
@@ -90,6 +92,10 @@ do
     -skip_ranger)
       SKIP_RANGER=1
       ;;
+    -timeout)
+      TIMEOUT_FOR_CREATE_LOAD_DATA_MINS=${2-}
+      shift;
+      ;;
     -help|-h|*)
       echo "create-load-data.sh : Creates data and loads from scratch"
       echo "[-skip_metadata_load] : Skips loading of metadata"
@@ -97,6 +103,7 @@ do
       echo "[-snapshot_file] : Loads the test warehouse snapshot into hdfs"
       echo "[-cm_host] : Address of the Cloudera Manager host if loading to a remote cluster"
       echo "[-skip_ranger] : Skip the set-up for Ranger."
+      echo "[-timeout] : The timeout in minutes for loading data."
       exit 1;
       ;;
     esac
@@ -107,6 +114,10 @@ if [[ -n $REMOTE_LOAD ]]; then
   SKIP_RANGER=1
 fi
 
+"${IMPALA_HOME}/bin/script-timeout-check.sh" -timeout $TIMEOUT_FOR_CREATE_LOAD_DATA_MINS \
+    -script_name "$(basename $0)" &
+TIMEOUT_PID=$!
+
 if [[ $SKIP_METADATA_LOAD -eq 0  && "$SNAPSHOT_FILE" = "" ]]; then
   run-step "Generating HBase data" create-hbase.log \
       ${IMPALA_HOME}/testdata/bin/create-hbase.sh
@@ -704,3 +715,8 @@ fi
 # restarting the minicluster works and doesn't impact the tests. This is a common
 # operation for developers, so it is nice to test it.
 restart-cluster
+
+# Kill the spawned timeout process and its child sleep process.
+# There may not be a sleep process, so ignore failure.
+pkill -P $TIMEOUT_PID || true
+kill $TIMEOUT_PID