You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by sl...@apache.org on 2016/07/28 14:34:59 UTC

[2/6] cassandra git commit: Rerun ReplicationAwareTokenAllocatorTest on failure to avoid flakiness.

Rerun ReplicationAwareTokenAllocatorTest on failure to avoid flakiness.

patch by Branimir Lambov; reviewed by Sylvain Lebresne for CASSANDRA-12277.


Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/a5cbb026
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/a5cbb026
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/a5cbb026

Branch: refs/heads/cassandra-3.9
Commit: a5cbb026195ace450a514db22a64dc9c7c8fa66e
Parents: 3f9b9ff
Author: Branimir Lambov <br...@datastax.com>
Authored: Tue Jul 26 15:19:51 2016 +0300
Committer: Sylvain Lebresne <sy...@datastax.com>
Committed: Thu Jul 28 16:25:07 2016 +0200

----------------------------------------------------------------------
 CHANGES.txt                                     |  1 +
 .../ReplicationAwareTokenAllocatorTest.java     | 15 ++++++
 test/unit/org/apache/cassandra/Util.java        | 56 ++++++++++++++++++++
 3 files changed, 72 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cassandra/blob/a5cbb026/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index caecefb..8f6997b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 3.0.9
+ * Rerun ReplicationAwareTokenAllocatorTest on failure to avoid flakiness (CASSANDRA-12277)
  * Exception when computing read-repair for range tombstones (CASSANDRA-12263)
  * Lost counter writes in compact table and static columns (CASSANDRA-12219)
  * AssertionError with MVs on updating a row that isn't indexed due to a null value (CASSANDRA-12247)

http://git-wip-us.apache.org/repos/asf/cassandra/blob/a5cbb026/test/long/org/apache/cassandra/dht/tokenallocator/ReplicationAwareTokenAllocatorTest.java
----------------------------------------------------------------------
diff --git a/test/long/org/apache/cassandra/dht/tokenallocator/ReplicationAwareTokenAllocatorTest.java b/test/long/org/apache/cassandra/dht/tokenallocator/ReplicationAwareTokenAllocatorTest.java
index 0a27e1c..1b36c55 100644
--- a/test/long/org/apache/cassandra/dht/tokenallocator/ReplicationAwareTokenAllocatorTest.java
+++ b/test/long/org/apache/cassandra/dht/tokenallocator/ReplicationAwareTokenAllocatorTest.java
@@ -29,6 +29,7 @@ import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
 
 import org.junit.Test;
 
+import org.apache.cassandra.Util;
 import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.dht.Token;
 
@@ -545,6 +546,20 @@ public class ReplicationAwareTokenAllocatorTest
     @Test
     public void testNewCluster()
     {
+        Util.flakyTest(this::flakyTestNewCluster,
+                       5,
+                       "It tends to fail sometimes due to the random selection of the tokens in the first few nodes.");
+    }
+
+    public void flakyTestNewCluster()
+    {
+        // This test is flaky because the selection of the tokens for the first RF nodes (which is random, with an
+        // uncontrolled seed) can sometimes cause a pathological situation where the algorithm will find a (close to)
+        // ideal distribution of tokens for some number of nodes, which in turn will inevitably cause it to go into a
+        // bad (unacceptable to the test criteria) distribution after adding one more node.
+
+        // This should happen very rarely, unless something is broken in the token allocation code.
+
         for (int rf = 2; rf <= 5; ++rf)
         {
             for (int perUnitCount = 1; perUnitCount <= MAX_VNODE_COUNT; perUnitCount *= 4)

http://git-wip-us.apache.org/repos/asf/cassandra/blob/a5cbb026/test/unit/org/apache/cassandra/Util.java
----------------------------------------------------------------------
diff --git a/test/unit/org/apache/cassandra/Util.java b/test/unit/org/apache/cassandra/Util.java
index 7ce8f04..e7b1ffa 100644
--- a/test/unit/org/apache/cassandra/Util.java
+++ b/test/unit/org/apache/cassandra/Util.java
@@ -536,6 +536,62 @@ public class Util
         thread.join(10000);
     }
 
+    public static AssertionError runCatchingAssertionError(Runnable test)
+    {
+        try
+        {
+            test.run();
+            return null;
+        }
+        catch (AssertionError e)
+        {
+            return e;
+        }
+    }
+
+    /**
+     * Wrapper function used to run a test that can sometimes flake for uncontrollable reasons.
+     *
+     * If the given test fails on the first run, it is executed the given number of times again, expecting all secondary
+     * runs to succeed. If they do, the failure is understood as a flake and the test is treated as passing.
+     *
+     * Do not use this if the test is deterministic and its success is not influenced by external factors (such as time,
+     * selection of random seed, network failures, etc.). If the test can be made independent of such factors, it is
+     * probably preferable to do so rather than use this method.
+     *
+     * @param test The test to run.
+     * @param rerunsOnFailure How many times to re-run it if it fails. All reruns must pass.
+     * @param message Message to send to System.err on initial failure.
+     */
+    public static void flakyTest(Runnable test, int rerunsOnFailure, String message)
+    {
+        AssertionError e = runCatchingAssertionError(test);
+        if (e == null)
+            return;     // success
+        System.err.format("Test failed. %s%n"
+                        + "Re-running %d times to verify it isn't failing more often than it should.%n"
+                        + "Failure was: %s%n", message, rerunsOnFailure, e);
+        e.printStackTrace();
+
+        int rerunsFailed = 0;
+        for (int i = 0; i < rerunsOnFailure; ++i)
+        {
+            AssertionError t = runCatchingAssertionError(test);
+            if (t != null)
+            {
+                ++rerunsFailed;
+                e.addSuppressed(t);
+            }
+        }
+        if (rerunsFailed > 0)
+        {
+            System.err.format("Test failed in %d of the %d reruns.%n", rerunsFailed, rerunsOnFailure);
+            throw e;
+        }
+
+        System.err.println("All reruns succeeded. Failure treated as flake.");
+    }
+
     // for use with Optional in tests, can be used as an argument to orElseThrow
     public static Supplier<AssertionError> throwAssert(final String message)
     {