Posted to commits@kudu.apache.org by ad...@apache.org on 2019/04/03 20:51:03 UTC

[kudu] 04/04: build: adapt new Java flaky test infrastructure to existing controls

This is an automated email from the ASF dual-hosted git repository.

adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit c550fe8f55ee0a4c6838d246e00e541ef4cff460
Author: Adar Dembo <ad...@cloudera.com>
AuthorDate: Mon Apr 1 21:59:21 2019 -0700

    build: adapt new Java flaky test infrastructure to existing controls
    
    Now that Java tests report success/failure, we can drive their retries with
    the existing flaky test controls. As a refresher, the C++ tests rely on these
    environment variables:
    - RUN_FLAKY_ONLY: whether to run just flaky tests or all tests
    - KUDU_FLAKY_TEST_ATTEMPTS: number of attempts for flaky tests
    - KUDU_FLAKY_TEST_LIST: path to list of flaky tests, one on each line
    - KUDU_RETRY_ALL_FAILED_TESTS: whether to retry all tests or just the ones
                                   in the flaky test list
    
    The algorithm is roughly:
      if RUN_FLAKY_ONLY or KUDU_FLAKY_TEST_ATTEMPTS > 1:
        populate KUDU_FLAKY_TEST_LIST from test result server
    
      if RUN_FLAKY_ONLY:
        testset = tests listed in KUDU_FLAKY_TEST_LIST
      else:
        testset = all tests
    
      for t in testset:
        if KUDU_RETRY_ALL_FAILED_TESTS or (KUDU_FLAKY_TEST_LIST and
                                           t in KUDU_FLAKY_TEST_LIST):
          num_attempts = KUDU_FLAKY_TEST_ATTEMPTS (or 1 if unset)
        else:
          num_attempts = 1
    
        run t up to num_attempts times
    
    You can see it at work in build-and-test.sh/run-test.sh. You can also see it
    in dist_test.py, though notably it doesn't care about RUN_FLAKY_ONLY because
    we never used that particular combination (presumably the list of flaky
    tests is short enough that it wouldn't benefit from distributed testing).
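
    To make the control flow concrete, here is a minimal Python sketch of the
    algorithm above. It is illustrative only: the helper names (all_tests,
    run_test) and the direct reads of os.environ are assumptions for this
    example, not the actual build-support code.

      import os

      def num_attempts(test_name, flaky_set):
          # Mirror the pseudocode above: only flaky tests (or all tests, if
          # KUDU_RETRY_ALL_FAILED_TESTS is set) get extra attempts.
          retry_all = bool(os.environ.get("KUDU_RETRY_ALL_FAILED_TESTS"))
          attempts = int(os.environ.get("KUDU_FLAKY_TEST_ATTEMPTS", "1"))
          return attempts if (retry_all or test_name in flaky_set) else 1

      def run_with_retries(all_tests, flaky_set, run_test):
          # 'run_test' is a hypothetical callable returning True on success.
          if os.environ.get("RUN_FLAKY_ONLY") == "1":
              testset = [t for t in all_tests if t in flaky_set]
          else:
              testset = all_tests
          failures = []
          for t in testset:
              # Stop as soon as one attempt passes.
              if not any(run_test(t) for _ in range(num_attempts(t, flaky_set))):
                  failures.append(t)
          return failures

    The real implementations layer these decisions on top of ctest, gradle, and
    the distributed test runner, as described above.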
    
    This patch attempts to mirror these exact semantics for Java tests. Here are
    the interesting changes:
    - In RetryRule, rerunFailingTestsCount is gone. The behavior is now controlled
      by the aforementioned environment variables instead.
    - In build-and-test.sh, if RUN_FLAKY_ONLY is set, parse the flaky test list
      into a series of --tests gradle command line arguments.
    - In dist_test.py, opt into the C++ flaky test handling (which reflects the
      above algorithm). There are also some small changes to flaky handling to
      accommodate Java's per-method flaky test tracking (sketched below).
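
    For example, the per-method accommodation boils down to mapping each flaky
    Java entry back to its enclosing class before handing it to dist-test. The
    helper name below is made up for illustration; the real logic lives in
    get_flakies() in the diff:

      def flaky_entry_to_retry_key(name):
          # dist-test retries whole test classes (or whole C++ test binaries),
          # but Java flakes are tracked per method, so drop the method component.
          if "." in name:
              # "o.a.k.client.TestKuduClient.testFoo" -> "o.a.k.client.TestKuduClient"
              return ".".join(name.split(".")[:-1])
          # C++ names like "tablet-test" contain no '.' and pass through as-is.
          return name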
    
    Note: all of this assumes that there's no overlap between the names of any
    C++ or Java tests, which is currently true as all C++ tests have names like
    "tablet-test" or "master_cert_authority-itest" while all Java tests are
    prefixed with "org.apache.kudu...". If this were to change, we'd need to
    properly "namespace" the test results in the reporting infrastructure and
    fetch the flaky test lists separately for C++ and Java tests. For now
    there's just one flaky test list, and both ctest and gradle are OK with
    being asked to run irrelevant tests (they'll just be ignored).
    
    Change-Id: Ia89598d7eeb5ab642ab4ebb7aa583adcce770eae
    Reviewed-on: http://gerrit.cloudera.org:8080/12917
    Reviewed-by: Grant Henke <gr...@apache.org>
    Tested-by: Adar Dembo <ad...@cloudera.com>
---
 build-support/dist_test.py                         |  40 ++++++--
 build-support/jenkins/build-and-test.sh            |  31 +++++--
 java/gradle/tests.gradle                           |   3 -
 .../java/org/apache/kudu/test/junit/RetryRule.java | 102 ++++++++++++++++++---
 4 files changed, 145 insertions(+), 31 deletions(-)

diff --git a/build-support/dist_test.py b/build-support/dist_test.py
index 992da9c..2e207f5 100755
--- a/build-support/dist_test.py
+++ b/build-support/dist_test.py
@@ -70,13 +70,17 @@ GRADLE_FLAGS = os.environ.get('EXTRA_GRADLE_FLAGS', "")
 PATH_TO_REPO = "../"
 
 # Matches the command line listings in 'ctest -V -N'. For example:
-#   262: Test command: /src/kudu/build-support/run-test.sh "/src/kudu/build/debug/bin/jsonwriter-test"
+#  262: Test command: /src/kudu/build-support/run-test.sh "/src/kudu/build/debug/bin/jsonwriter-test"
 TEST_COMMAND_RE = re.compile('Test command: (.+)$')
 
 # Matches the environment variable listings in 'ctest -V -N'. For example:
 #  262:  GTEST_TOTAL_SHARDS=1
 TEST_ENV_RE = re.compile('^\d+:  (\S+)=(.+)')
 
+# Matches test names that have a shard suffix. For example:
+#  master-stress-test.8
+TEST_SHARD_RE = re.compile("\.\d+$")
+
 DEPS_FOR_ALL = \
     ["build-support/stacktrace_addr2line.pl",
      "build-support/run-test.sh",
@@ -347,6 +351,10 @@ def create_task_json(staging,
 
   If 'replicate_tasks' is higher than one, each .isolate file will be
   submitted multiple times. This can be useful for looping tests.
+
+  The test name is compared with the contents of 'flaky_test_set' to decide
+  how many times the execution service should retry the test on failure.
+  Alternatively, if 'retry_all_tests' is True, all tests will be retried.
   """
   tasks = []
   with file(staging.archive_dump_path(), "r") as isolate_dump:
@@ -356,9 +364,9 @@ def create_task_json(staging,
   # the dumped JSON. Others list it in an 'items' dictionary.
   items = inmap.get('items', inmap)
   for k, v in items.iteritems():
-    # The key is 'foo-test.<shard>'. So, chop off the last component
-    # to get the test name
-    test_name = ".".join(k.split(".")[:-1])
+    # The key may be 'foo-test.<shard>'. So, chop off the last component
+    # to get the test name.
+    test_name = ".".join(k.split(".")[:-1]) if TEST_SHARD_RE.search(k) else k
     max_retries = 0
     if test_name in flaky_test_set or retry_all_tests:
       max_retries = FLAKY_TEST_RETRIES
@@ -426,7 +434,20 @@ def get_flakies():
   path = os.getenv('KUDU_FLAKY_TEST_LIST')
   if not path:
     return set()
-  return set(l.strip() for l in file(path))
+
+  # dist-test can only retry tests on a per-class basis, but
+  # KUDU_FLAKY_TEST_LIST lists Java flakes on a per-method basis.
+  flaky_classes = []
+  with open(path) as f:
+    for l in f:
+      l = l.strip()
+      if '.' in l:
+        # "o.a.k.client.TestKuduClient.testFoo" -> "o.a.k.client.TestKuduClient"
+        flaky_classes.append('.'.join(l.split('.')[:-1]))
+      else:
+        flaky_classes.append(l)
+
+  return set(flaky_classes)
 
 def run_tests(parser, options):
   """
@@ -545,10 +566,11 @@ def run_java_tests(parser, options):
                         cwd=rel_to_abs("java"))
   staging = StagingDir(rel_to_abs("java/build/dist-test"))
   run_isolate(staging)
-  # TODO(ghenke): Add Java tests to the flaky dashboard
-  # KUDU_FLAKY_TEST_LIST doesn't included Java tests.
-  # Instead we will retry all Java tests in case they are flaky.
-  create_task_json(staging, 1, retry_all_tests=True)
+  retry_all = RETRY_ALL_TESTS > 0
+  create_task_json(staging,
+                   flaky_test_set=get_flakies(),
+                   replicate_tasks=1,
+                   retry_all_tests=retry_all)
   submit_tasks(staging, options)
 
 def loop_java_test(parser, options):
diff --git a/build-support/jenkins/build-and-test.sh b/build-support/jenkins/build-and-test.sh
index 23b20e4..4eaf556 100755
--- a/build-support/jenkins/build-and-test.sh
+++ b/build-support/jenkins/build-and-test.sh
@@ -223,6 +223,13 @@ if [ "$RUN_FLAKY_ONLY" == "1" -o "$KUDU_FLAKY_TEST_ATTEMPTS" -gt 1 ]; then
   fi
 
   if [ "$RUN_FLAKY_ONLY" == "1" ]; then
+    # TODO(adar): we can't yet pass the flaky test list into the dist-test
+    # machinery (in order to control which tests are executed).
+    if [ "$ENABLE_DIST_TEST" == "1" ]; then
+      echo "Distributed testing is incompatible with RUN_FLAKY_ONLY=1"
+      exit 1
+    fi
+
     test_regex=$(perl -e '
       chomp(my @lines = <>);
       print join("|", map { "^" . quotemeta($_) } @lines);
@@ -232,14 +239,25 @@ if [ "$RUN_FLAKY_ONLY" == "1" -o "$KUDU_FLAKY_TEST_ATTEMPTS" -gt 1 ]; then
       exit 0
     fi
 
+    # Set up ctest/gradle to run only those tests found in the flaky test list.
+    #
+    # Note: the flaky test list contains both C++ and Java tests and we pass it
+    # in its entirety to both ctest and gradle. This is safe because:
+    # 1. There are no test name collisions between C++ and Java tests.
+    # 2. Both ctest and gradle will happily ignore tests they can't find.
+    #
+    # If either of these assumptions changes, we'll need to explicitly split the
+    # test list into two lists, either here or in the test result server.
     EXTRA_TEST_FLAGS="$EXTRA_TEST_FLAGS -R $test_regex"
+    while IFS="" read t || [ -n "$t" ]
+    do
+      EXTRA_GRADLE_TEST_FLAGS="--tests $t $EXTRA_GRADLE_TEST_FLAGS"
+    done < $KUDU_FLAKY_TEST_LIST
 
-    # We don't support detecting java and python flaky tests at the moment.
-    echo "RUN_FLAKY_ONLY=1: running flaky tests only,"\
-         "disabling Java and python builds."
+    # We don't support detecting python flaky tests at the moment.
+    echo "RUN_FLAKY_ONLY=1: running flaky tests only, disabling python build."
     BUILD_PYTHON=0
     BUILD_PYTHON3=0
-    BUILD_JAVA=0
   elif [ "$KUDU_FLAKY_TEST_ATTEMPTS" -gt 1 ]; then
     echo Will retry the flaky tests up to $KUDU_FLAKY_TEST_ATTEMPTS times.
   fi
@@ -373,7 +391,7 @@ EXIT_STATUS=0
 FAILURES=""
 
 # If we're running distributed C++ tests, submit them asynchronously while
-# we run the Java and Python tests.
+# we run any local tests.
 if [ "$ENABLE_DIST_TEST" == "1" ]; then
   echo
   echo Submitting C++ distributed-test job.
@@ -424,7 +442,6 @@ if [ "$BUILD_JAVA" == "1" ]; then
   export EXTRA_GRADLE_FLAGS="--console=plain"
   EXTRA_GRADLE_FLAGS="$EXTRA_GRADLE_FLAGS --no-daemon"
   EXTRA_GRADLE_FLAGS="$EXTRA_GRADLE_FLAGS --continue"
-  EXTRA_GRADLE_FLAGS="$EXTRA_GRADLE_FLAGS -DrerunFailingTestsCount=3"
   # KUDU-2524: temporarily disable scalafmt until we can work out its JDK
   # incompatibility issue.
   EXTRA_GRADLE_FLAGS="$EXTRA_GRADLE_FLAGS -DskipFormat"
@@ -443,7 +460,7 @@ if [ "$BUILD_JAVA" == "1" ]; then
     fi
   else
     # TODO: Run `gradle check` in BUILD_TYPE DEBUG when static code analysis is fixed
-    if ! ./gradlew $EXTRA_GRADLE_FLAGS clean test ; then
+    if ! ./gradlew $EXTRA_GRADLE_FLAGS clean test $EXTRA_GRADLE_TEST_FLAGS; then
       TESTS_FAILED=1
       FAILURES="$FAILURES"$'Java Gradle build/test failed\n'
     fi
diff --git a/java/gradle/tests.gradle b/java/gradle/tests.gradle
index 2f43d96..2b541c4 100644
--- a/java/gradle/tests.gradle
+++ b/java/gradle/tests.gradle
@@ -62,9 +62,6 @@ tasks.withType(Test) {
   systemProperty "java.net.preferIPv4Stack", true
   systemProperty "java.security.egd", "file:/dev/urandom" // Improve RNG generation speed.
 
-  // Set rerunFailingTestsCount for use in BaseKuduTest.java to rerun failing tests.
-  systemProperty "rerunFailingTestsCount", propertyWithDefault("rerunFailingTestsCount", 0)
-
   // Set kuduBinDir to the binaries to use with the MiniKuduCluster.
   systemProperty "kuduBinDir", propertyWithDefault("kuduBinDir", "$project.rootDir/../build/latest/bin")
 
diff --git a/java/kudu-test-utils/src/main/java/org/apache/kudu/test/junit/RetryRule.java b/java/kudu-test-utils/src/main/java/org/apache/kudu/test/junit/RetryRule.java
index 31d8a95..a4611db 100644
--- a/java/kudu-test-utils/src/main/java/org/apache/kudu/test/junit/RetryRule.java
+++ b/java/kudu-test-utils/src/main/java/org/apache/kudu/test/junit/RetryRule.java
@@ -16,6 +16,8 @@
 // under the License.
 package org.apache.kudu.test.junit;
 
+import com.google.common.base.Preconditions;
+
 import org.apache.kudu.test.CapturingToFileLogAppender;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.apache.yetus.audience.InterfaceStability;
@@ -24,15 +26,23 @@ import org.junit.runner.Description;
 import org.junit.runners.model.Statement;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import java.io.BufferedReader;
 import java.io.Closeable;
 import java.io.File;
 import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.HashSet;
+import java.util.Set;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
 
 /**
  * JUnit rule to retry failed tests.
  *
- * The number of retries is controlled by the "rerunFailingTestsCount" system
- * property, mimicking Surefire in that regard.
+ * Uses the KUDU_FLAKY_TEST_LIST and KUDU_RETRY_ALL_FAILED_TESTS environment
+ * variables to determine whether a test should be retried, and the
+ * KUDU_FLAKY_TEST_ATTEMPTS environment variable to determine how many times.
  *
  * By default will use ResultReporter to report success/failure of each test
  * attempt to an external server; this may be skipped if desired.
@@ -40,44 +50,112 @@ import java.io.IOException;
 @InterfaceAudience.Private
 @InterfaceStability.Unstable
 public class RetryRule implements TestRule {
-
-  private static final String RETRY_PROP = "rerunFailingTestsCount";
-
   private static final Logger LOG = LoggerFactory.getLogger(RetryRule.class);
+  private static final int DEFAULT_RETRY_COUNT = 0;
+  private static final Set<String> FLAKY_TESTS = new HashSet<>();
 
   private final int retryCount;
   private final ResultReporter reporter;
 
+  static {
+    // Initialize the flaky test set if it exists. The file will have one test
+    // name per line.
+    String value = System.getenv("KUDU_FLAKY_TEST_LIST");
+    if (value != null) {
+      try (BufferedReader br = Files.newBufferedReader(Paths.get(value), UTF_8)) {
+        for (String l = br.readLine(); l != null; l = br.readLine()) {
+          FLAKY_TESTS.add(l);
+        }
+      } catch (Exception e) {
+        throw new RuntimeException(e);
+      }
+    }
+  }
+
   public RetryRule() {
-    this(Integer.getInteger(RETRY_PROP, 0), /*skipReporting=*/ false);
+    this(DEFAULT_RETRY_COUNT, /*skipReporting=*/ false);
   }
 
   @InterfaceAudience.LimitedPrivate("Test")
   RetryRule(int retryCount, boolean skipReporting) {
+    Preconditions.checkArgument(retryCount >= 0);
     this.retryCount = retryCount;
     this.reporter = skipReporting ? null : new ResultReporter();
   }
 
+  private static boolean retryAllTests() {
+    String value = System.getenv("KUDU_RETRY_ALL_FAILED_TESTS");
+    return value != null && !value.isEmpty();
+  }
+
+  private static boolean retryThisTest(String humanReadableTestName) {
+    return FLAKY_TESTS.contains(humanReadableTestName);
+  }
+
+  private static int getActualRetryCount() {
+    String value = System.getenv("KUDU_FLAKY_TEST_ATTEMPTS");
+    if (value == null) {
+      return DEFAULT_RETRY_COUNT;
+    }
+    try {
+      int val = Integer.parseInt(value);
+      if (val < 1) {
+        throw new NumberFormatException(
+            String.format("expected non-zero positive value, got %d", val));
+      }
+
+      // Convert from number of "attempts" to number of "retries".
+      return Integer.parseInt(value) - 1;
+    } catch (NumberFormatException e) {
+      LOG.warn("Could not parse KUDU_FLAKY_TEST_ATTEMPTS, using default value ({})",
+               DEFAULT_RETRY_COUNT, e);
+      return DEFAULT_RETRY_COUNT;
+    }
+  }
+
   @Override
   public Statement apply(Statement base, Description description) {
-    return new RetryStatement(base, description, retryCount, reporter);
+    String humanReadableTestName =
+        description.getClassName() + "." + description.getMethodName();
+
+    // Retrying and reporting are independent; the RetryStatement is used if
+    // either is enabled. We'll retry the test under one of the following
+    // circumstances:
+    //
+    // 1. The RetryRule was constructed with an explicit retry count.
+    // 2. We've been asked to retry all tests via KUDU_RETRY_ALL_FAILED_TESTS.
+    // 3. We've been asked to retry this test via KUDU_FLAKY_TEST_LIST.
+    //
+    // In the latter two cases, we consult KUDU_FLAKY_TEST_ATTEMPTS for the retry count.
+    boolean retryExplicit = retryCount != DEFAULT_RETRY_COUNT;
+    boolean retryAll = retryAllTests();
+    boolean retryThis = retryThisTest(humanReadableTestName);
+    if (retryExplicit || retryAll || retryThis || reporter != null) {
+      int actualRetryCount = (retryAll || retryThis) ? getActualRetryCount() : retryCount;
+      LOG.info("Creating RetryStatement {} result reporter and retry count of {} ({})",
+               reporter != null ? "with" : "without",
+               actualRetryCount,
+               retryExplicit ? "explicit" :
+                 retryAll ? "all tests" :
+                   retryThis ? "this test" : "no retries");
+      return new RetryStatement(base, actualRetryCount, reporter, humanReadableTestName);
+    }
+    return base;
   }
 
   private static class RetryStatement extends Statement {
 
     private final Statement base;
-    private final Description description;
     private final int retryCount;
     private final ResultReporter reporter;
     private final String humanReadableTestName;
 
-    RetryStatement(Statement base, Description description,
-                   int retryCount, ResultReporter reporter) {
+    RetryStatement(Statement base, int retryCount, ResultReporter reporter,
+                   String humanReadableTestName) {
       this.base = base;
-      this.description = description;
       this.retryCount = retryCount;
       this.reporter = reporter;
-      this.humanReadableTestName = description.getClassName() + "." + description.getMethodName();
+      this.humanReadableTestName = humanReadableTestName;
     }
 
     private void report(ResultReporter.Result result, File logFile) {