You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2020/07/13 01:05:35 UTC

[impala] branch master updated: IMPALA-9887: Add support for sharding end-to-end tests

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 605e301  IMPALA-9887: Add support for sharding end-to-end tests
605e301 is described below

commit 605e301739b8ef7619482db9b13444e84145b219
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Wed Jun 24 12:27:04 2020 -0700

    IMPALA-9887: Add support for sharding end-to-end tests
    
    ASAN maintains stacks for each allocation and free of memory. Impala
    sometimes allocates/frees memory from codegen'd code, so this means
    that the number of distinct stacks is unbounded. ASAN is storing
    these stacks in a hash table with a fixed number of buckets (one million).
    As the stacks accumulate, allocations and frees get slower and slower,
    because the lookup in this hashtable gets slower. This causes test
    execution time to degrade over time. Since backend tests and custom cluster
    tests don't have long-running daemons, only the end-to-end tests are
    affected.
    
    This adds support for breaking end-to-end test execution into shards,
    restarting Impala between each shard. This uses the preexisting shard_tests
    pytest functionality introduced for the docker-based tests in IMPALA-6070.
    The number of shards is configurable via the EE_TEST_SHARDS environment
    variable. By default, EE_TEST_SHARDS=1 and no sharding is used.
    
    Without sharding, an ASAN core job takes about 16-17 hours. With 6 shards,
    it takes about 9 hours. It is recommended to always use sharding with ASAN.
    
    Testing:
     - Ran core job
     - Ran ASAN with EE_TEST_SHARDS=6
    
    Change-Id: I0bdbd79940df2bc7b951efdf0f044e6b40a3fda9
    Reviewed-on: http://gerrit.cloudera.org:8080/16155
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 bin/run-all-tests.sh | 43 +++++++++++++++++++++++++++++++++++++------
 tests/run-tests.py   | 31 +++++++++++++++++++++++++++----
 2 files changed, 64 insertions(+), 10 deletions(-)

diff --git a/bin/run-all-tests.sh b/bin/run-all-tests.sh
index 3a1f8b8..5287861 100755
--- a/bin/run-all-tests.sh
+++ b/bin/run-all-tests.sh
@@ -46,6 +46,7 @@ fi
 # Run End-to-end Tests
 : ${EE_TEST:=true}
 : ${EE_TEST_FILES:=}
+: ${EE_TEST_SHARDS:=1}
 # Run JDBC Test
 : ${JDBC_TEST:=true}
 # Run Cluster Tests
@@ -158,6 +159,8 @@ LOG_DIR="${IMPALA_EE_TEST_LOGS_DIR}"
 # Enable core dumps
 ulimit -c unlimited || true
 
+TEST_RET_CODE=0
+
 # Helper function to start Impala cluster.
 start_impala_cluster() {
   # TODO: IMPALA-9812: remove --unlock_mt_dop when it is no longer needed.
@@ -167,6 +170,21 @@ start_impala_cluster() {
       ${TEST_START_CLUSTER_ARGS} --impalad_args=--unlock_mt_dop=true
 }
 
+run_ee_tests() {
+  if [[ $# -gt 0 ]]; then
+    EXTRA_ARGS=${1}
+  else
+    EXTRA_ARGS=""
+  fi
+  # Run end-to-end tests.
+  # KERBEROS TODO - this will need to deal with ${KERB_ARGS}
+  if ! "${IMPALA_HOME}/tests/run-tests.py" ${COMMON_PYTEST_ARGS} \
+      ${RUN_TESTS_ARGS} ${EXTRA_ARGS} ${EE_TEST_FILES}; then
+    #${KERB_ARGS};
+    TEST_RET_CODE=1
+  fi
+}
+
 for i in $(seq 1 $NUM_TEST_ITERATIONS)
 do
   TEST_RET_CODE=0
@@ -231,12 +249,25 @@ do
   fi
 
   if [[ "$EE_TEST" == true ]]; then
-    # Run end-to-end tests.
-    # KERBEROS TODO - this will need to deal with ${KERB_ARGS}
-    if ! "${IMPALA_HOME}/tests/run-tests.py" ${COMMON_PYTEST_ARGS} \
-        ${RUN_TESTS_ARGS} ${EE_TEST_FILES}; then
-      #${KERB_ARGS};
-      TEST_RET_CODE=1
+    if [[ ${EE_TEST_SHARDS} -lt 2 ]]; then
+      # For runs without sharding, avoid adding the "--shard_tests" parameter.
+      # Some test frameworks (e.g. the docker-based tests) use this.
+      run_ee_tests
+    else
+      # When the EE tests are sharded, it runs 1/Nth of the tests at a time, restarting
+      # Impala between the shards. There are two benefits:
+      # 1. It isolates errors so that if Impala crashes, the next shards will still run
+      #    with a fresh Impala.
+      # 2. For ASAN runs, resources accumulate over test execution, so tests get slower
+      #    over time (see IMPALA-9887). Running shards with regular restarts
+      #    substantially speeds up execution time.
+      #
+      # Shards are 1 indexed (i.e. 1/N through N/N). This shards both serial and
+      # parallel tests.
+      for (( shard_idx=1 ; shard_idx <= ${EE_TEST_SHARDS} ; shard_idx++ )); do
+        run_ee_tests "--shard_tests=$shard_idx/${EE_TEST_SHARDS}"
+        start_impala_cluster
+      done
     fi
   fi
 
diff --git a/tests/run-tests.py b/tests/run-tests.py
index 55b002a..8f1e8d3 100755
--- a/tests/run-tests.py
+++ b/tests/run-tests.py
@@ -282,22 +282,44 @@ if __name__ == "__main__":
     run(sys.argv[1:])
   else:
     print_metrics('connections')
+
+    # If using sharding, it is useful to include it in the output filenames so that
+    # different shards don't overwrite each other. If not using sharding, use the
+    # normal filenames. This does not validate the shard_tests argument.
+    shard_identifier = ""
+    shard_arg = None
+    for idx, arg in enumerate(sys.argv):
+      # This deliberately does not stop at the first occurrence. It continues through
+      # all the arguments to find the last occurrence of shard_tests.
+      if arg == "--shard_tests":
+        # Form 1: --shard_tests N/M (space separation => grab next argument)
+        assert idx + 1 < len(sys.argv), "shard_args expects an argument"
+        shard_arg = sys.argv[idx + 1]
+      elif "--shard_tests=" in arg:
+        # Form 2: --shard_tests=N/M
+        shard_arg = arg.replace("--shard_tests=", "")
+
+    if shard_arg:
+      # The shard argument is "N/M" where N <= M. Convert to a string that can be used
+      # in a filename.
+      shard_identifier = "_shard_{0}".format(shard_arg.replace("/", "_"))
+
     # First run query tests that need to be executed serially
     if not skip_serial:
       base_args = ['-m', 'execute_serially']
-      run(base_args + build_test_args('serial'))
+      run(base_args + build_test_args("serial{0}".format(shard_identifier)))
       print_metrics('connections')
 
     # Run the stress tests tests
     if not skip_stress:
       base_args = ['-m', 'stress', '-n', NUM_STRESS_CLIENTS]
-      run(base_args + build_test_args('stress'))
+      run(base_args + build_test_args("stress{0}".format(shard_identifier)))
       print_metrics('connections')
 
     # Run the remaining query tests in parallel
     if not skip_parallel:
       base_args = ['-m', 'not execute_serially and not stress', '-n', NUM_CONCURRENT_TESTS]
-      run(base_args + build_test_args('parallel'))
+      run(base_args + build_test_args("parallel{0}".format(shard_identifier)))
 
     # The total number of tests executed at this point is expected to be >0
     # If it is < 0 then the script needs to exit with a non-zero
@@ -306,7 +328,8 @@ if __name__ == "__main__":
       sys.exit(1)
 
     # Finally, validate impalad/statestored metrics.
-    args = build_test_args(base_name='verify-metrics', valid_dirs=['verifiers'])
+    args = build_test_args(base_name="verify-metrics{0}".format(shard_identifier),
+                           valid_dirs=['verifiers'])
     args.append('verifiers/test_verify_metrics.py')
     run(args)