You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2018/11/29 18:19:17 UTC

[08/16] lucene-solr:master: SOLR-12801: Make massive improvements to the tests.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/TestTlogReplica.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestTlogReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestTlogReplica.java
index 8e66b1e..0318b1e 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestTlogReplica.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestTlogReplica.java
@@ -39,6 +39,7 @@ import org.apache.http.client.methods.HttpGet;
 import org.apache.http.client.methods.HttpPost;
 import org.apache.http.entity.StringEntity;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
 import org.apache.lucene.util.LuceneTestCase.Slow;
 import org.apache.solr.client.solrj.SolrClient;
 import org.apache.solr.client.solrj.SolrQuery;
@@ -75,6 +76,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 @Slow
+@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12313")
 public class TestTlogReplica extends SolrCloudTestCase {
   
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@@ -83,7 +85,7 @@ public class TestTlogReplica extends SolrCloudTestCase {
   private final static int REPLICATION_TIMEOUT_SECS = 10;
   
   private String suggestedCollectionName() {
-    return (getTestClass().getSimpleName().replace("Test", "") + "_" + getTestName().split(" ")[0]).replaceAll("(.)(\\p{Upper})", "$1_$2").toLowerCase(Locale.ROOT);
+    return (getTestClass().getSimpleName().replace("Test", "") + "_" + getSaferTestName().split(" ")[0]).replaceAll("(.)(\\p{Upper})", "$1_$2").toLowerCase(Locale.ROOT);
   }
 
   @BeforeClass
@@ -116,7 +118,7 @@ public class TestTlogReplica extends SolrCloudTestCase {
     for (JettySolrRunner jetty:cluster.getJettySolrRunners()) {
       if (!jetty.isRunning()) {
         log.warn("Jetty {} not running, probably some bad test. Starting it", jetty.getLocalPort());
-        ChaosMonkey.start(jetty);
+        jetty.start();
       }
     }
     if (cluster.getSolrClient().getZkStateReader().getClusterState().getCollectionOrNull(collectionName) != null) {
@@ -156,6 +158,7 @@ public class TestTlogReplica extends SolrCloudTestCase {
           CollectionAdminRequest.createCollection(collectionName, "conf", 2, 0, 4, 0)
           .setMaxShardsPerNode(100)
           .process(cluster.getSolrClient());
+          cluster.waitForActiveCollection(collectionName, 2, 8);
           break;
         case 1:
           // Sometimes don't use SolrJ
@@ -168,6 +171,7 @@ public class TestTlogReplica extends SolrCloudTestCase {
           HttpGet createCollectionGet = new HttpGet(url);
           HttpResponse httpResponse = cluster.getSolrClient().getHttpClient().execute(createCollectionGet);
           assertEquals(200, httpResponse.getStatusLine().getStatusCode());
+          cluster.waitForActiveCollection(collectionName, 2, 8);
           break;
         case 2:
           // Sometimes use V2 API
@@ -182,6 +186,7 @@ public class TestTlogReplica extends SolrCloudTestCase {
           createCollectionPost.setEntity(new StringEntity(requestBody));
           httpResponse = cluster.getSolrClient().getHttpClient().execute(createCollectionPost);
           assertEquals(200, httpResponse.getStatusLine().getStatusCode());
+          cluster.waitForActiveCollection(collectionName, 2, 8);
           break;
       }
       
@@ -213,6 +218,7 @@ public class TestTlogReplica extends SolrCloudTestCase {
           CollectionAdminResponse response = CollectionAdminRequest.reloadCollection(collectionName)
           .process(cluster.getSolrClient());
           assertEquals(0, response.getStatus());
+          waitForState("failed waiting for active collection", collectionName, clusterShape(2, 4));
           reloaded = true;
         }
       }
@@ -273,7 +279,7 @@ public class TestTlogReplica extends SolrCloudTestCase {
     addReplicaToShard("shard2", Replica.Type.TLOG);
     docCollection = assertNumberOfReplicas(0, 4, 0, true, false);
     
-    waitForState("Expecting collection to have 2 shards and 2 replica each", collectionName, clusterShape(2, 2));
+    waitForState("Expecting collection to have 2 shards and 2 replica each", collectionName, clusterShape(2, 4));
     
     //Delete tlog replica from shard1
     CollectionAdminRequest.deleteReplica(
@@ -395,7 +401,7 @@ public class TestTlogReplica extends SolrCloudTestCase {
       .process(cluster.getSolrClient());
     } else {
       leaderJetty = cluster.getReplicaJetty(s.getLeader());
-      ChaosMonkey.kill(leaderJetty);
+      leaderJetty.stop();
       waitForState("Leader replica not removed", collectionName, clusterShape(1, 1));
       // Wait for cluster state to be updated
       waitForState("Replica state not updated in cluster state", 
@@ -425,7 +431,7 @@ public class TestTlogReplica extends SolrCloudTestCase {
     if (removeReplica) {
       CollectionAdminRequest.addReplicaToShard(collectionName, "shard1", Replica.Type.TLOG).process(cluster.getSolrClient());
     } else {
-      ChaosMonkey.start(leaderJetty);
+      leaderJetty.start();
     }
     waitForState("Expected collection to be 1x2", collectionName, clusterShape(1, 2));
     // added replica should replicate from the leader
@@ -441,7 +447,7 @@ public class TestTlogReplica extends SolrCloudTestCase {
     waitForNumDocsInAllActiveReplicas(1);
     
     JettySolrRunner pullReplicaJetty = cluster.getReplicaJetty(docCollection.getSlice("shard1").getReplicas(EnumSet.of(Replica.Type.TLOG)).get(0));
-    ChaosMonkey.kill(pullReplicaJetty);
+    pullReplicaJetty.stop();
     waitForState("Replica not removed", collectionName, activeReplicaCount(0, 1, 0));
 //    // Also wait for the replica to be placed in state="down"
 //    waitForState("Didn't update state", collectionName, clusterStateReflectsActiveAndDownReplicas());
@@ -450,7 +456,7 @@ public class TestTlogReplica extends SolrCloudTestCase {
     cluster.getSolrClient().commit(collectionName);
     waitForNumDocsInAllActiveReplicas(2);
     
-    ChaosMonkey.start(pullReplicaJetty);
+    pullReplicaJetty.start();
     waitForState("Replica not added", collectionName, activeReplicaCount(0, 2, 0));
     waitForNumDocsInAllActiveReplicas(2);
   }
@@ -538,15 +544,15 @@ public class TestTlogReplica extends SolrCloudTestCase {
         .process(cloudClient, collectionName);
     JettySolrRunner solrRunner = getSolrRunner(false).get(0);
     if (useKill) { 
-      ChaosMonkey.kill(solrRunner);
+      solrRunner.stop();
     } else {
-      ChaosMonkey.stop(solrRunner);
+      solrRunner.stop();
     }
     waitForState("Replica still up", collectionName, activeReplicaCount(0,1,0));
     new UpdateRequest()
         .add(sdoc("id", "6"))
         .process(cloudClient, collectionName);
-    ChaosMonkey.start(solrRunner);
+    solrRunner.start();
     waitForState("Replica didn't recover", collectionName, activeReplicaCount(0,2,0));
     // We skip peerSync, so replica will always trigger commit on leader
     // We query only the non-leader replicas, since we haven't opened a new searcher on the leader yet
@@ -566,10 +572,10 @@ public class TestTlogReplica extends SolrCloudTestCase {
     }
     checkRTG(3,7, cluster.getJettySolrRunners());
     DirectUpdateHandler2.commitOnClose = false;
-    ChaosMonkey.stop(solrRunner);
+    solrRunner.stop();
     waitForState("Replica still up", collectionName, activeReplicaCount(0,1,0));
     DirectUpdateHandler2.commitOnClose = true;
-    ChaosMonkey.start(solrRunner);
+    solrRunner.start();
     waitForState("Replica didn't recover", collectionName, activeReplicaCount(0,2,0));
     waitForNumDocsInAllReplicas(5, getNonLeaderReplias(collectionName), 10); //timeout for stale collection state
     checkRTG(3,7, cluster.getJettySolrRunners());
@@ -588,11 +594,11 @@ public class TestTlogReplica extends SolrCloudTestCase {
       }
     };
     if (useKill) { 
-      ChaosMonkey.kill(solrRunner);
+      solrRunner.stop();
     } else {
-      ChaosMonkey.stop(solrRunner);
+      solrRunner.stop();
     }
-    ChaosMonkey.start(solrRunner);
+    solrRunner.start();
     waitingForReplay.acquire();
     // If I add the doc immediately, the leader fails to communicate with the follower with broken pipe.
     // Options are, wait or retry...
@@ -660,13 +666,13 @@ public class TestTlogReplica extends SolrCloudTestCase {
         .add(sdoc("id", "2"))
         .process(cloudClient, collectionName);
     JettySolrRunner oldLeaderJetty = getSolrRunner(true).get(0);
-    ChaosMonkey.kill(oldLeaderJetty);
+    oldLeaderJetty.stop();
     waitForState("Replica not removed", collectionName, activeReplicaCount(0, 1, 0));
     new UpdateRequest()
         .add(sdoc("id", "3"))
         .add(sdoc("id", "4"))
         .process(cloudClient, collectionName);
-    ChaosMonkey.start(oldLeaderJetty);
+    oldLeaderJetty.start();
     waitForState("Replica not added", collectionName, activeReplicaCount(0, 2, 0));
     checkRTG(1,4, cluster.getJettySolrRunners());
     new UpdateRequest()
@@ -692,7 +698,7 @@ public class TestTlogReplica extends SolrCloudTestCase {
     }
     JettySolrRunner oldLeaderJetty = getSolrRunner(true).get(0);
     String oldLeaderNodeName = oldLeaderJetty.getNodeName();
-    ChaosMonkey.kill(oldLeaderJetty);
+    oldLeaderJetty.stop();
     waitForState("Replica not removed", collectionName, activeReplicaCount(0, 1, 0));
     waitForState("Expect new leader", collectionName,
         (liveNodes, collectionState) -> {
@@ -701,7 +707,7 @@ public class TestTlogReplica extends SolrCloudTestCase {
           return !leader.getNodeName().equals(oldLeaderNodeName);
         }
     );
-    ChaosMonkey.start(oldLeaderJetty);
+    oldLeaderJetty.start();
     waitForState("Replica not added", collectionName, activeReplicaCount(0, 2, 0));
     checkRTG(1,1, cluster.getJettySolrRunners());
     SolrDocument doc = cluster.getSolrClient().getById(collectionName,"1");
@@ -748,7 +754,7 @@ public class TestTlogReplica extends SolrCloudTestCase {
     .process(cluster.getSolrClient());
     int numReplicasPerShard = numNrtReplicas + numTlogReplicas + numPullReplicas;
     waitForState("Expected collection to be created with " + numShards + " shards and  " + numReplicasPerShard + " replicas",
-        collectionName, clusterShape(numShards, numReplicasPerShard));
+        collectionName, clusterShape(numShards, numShards * numReplicasPerShard));
     return assertNumberOfReplicas(numNrtReplicas*numShards, numTlogReplicas*numShards, numPullReplicas*numShards, false, true);
   }
   

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java b/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java
index 87bab84..0fe45c9 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java
@@ -110,7 +110,6 @@ public class TestTolerantUpdateProcessorCloud extends SolrCloudTestCase {
     configureCluster(NUM_SERVERS)
       .addConfig(configName, configDir.toPath())
       .configure();
-    assertSpinLoopAllJettyAreRunning(cluster);
 
     CLOUD_CLIENT = cluster.getSolrClient();
     CLOUD_CLIENT.setDefaultCollection(COLLECTION_NAME);
@@ -120,10 +119,9 @@ public class TestTolerantUpdateProcessorCloud extends SolrCloudTestCase {
         .withProperty("schema", "schema15.xml") // string id for doc routing prefix
         .process(CLOUD_CLIENT);
     
-    ZkStateReader zkStateReader = CLOUD_CLIENT.getZkStateReader();
-    AbstractDistribZkTestBase.waitForRecoveriesToFinish(COLLECTION_NAME, zkStateReader, true, true, 330);
-
+    cluster.waitForActiveCollection(COLLECTION_NAME, NUM_SHARDS, REPLICATION_FACTOR * NUM_SHARDS);
 
+    ZkStateReader zkStateReader = CLOUD_CLIENT.getZkStateReader();
     // really hackish way to get a URL for specific nodes based on shard/replica hosting
     // inspired by TestMiniSolrCloudCluster
     HashMap<String, String> urlMap = new HashMap<>();
@@ -922,40 +920,6 @@ public class TestTolerantUpdateProcessorCloud extends SolrCloudTestCase {
     assertQueryDocIds(client, false, docId21, docId22);
                       
   }
-
-  /**
-   * HACK: Loops over every Jetty instance in the specified MiniSolrCloudCluster to see if they are running,
-   * and sleeps small increments until they all report that they are, or a max num iters is reached
-   * 
-   * (work around for SOLR-8862.  Maybe something like this should be promoted into MiniSolrCloudCluster's 
-   * start() method? or SolrCloudTestCase's configureCluster?)
-   */
-  public static void assertSpinLoopAllJettyAreRunning(MiniSolrCloudCluster cluster) throws InterruptedException {
-    // NOTE: idealy we could use an ExecutorService that tried to open Sockets (with a long timeout)
-    // to each of the jetty instances in parallel w/o any sleeping -- but since they pick their ports
-    // dynamically and don't report them until/unless the server is up, that won't neccessarily do us
-    // any good.
-    final int numServers = cluster.getJettySolrRunners().size();
-    int numRunning = 0;
-    for (int i = 5; 0 <= i; i--) {
-      numRunning = 0;
-      for (JettySolrRunner jetty : cluster.getJettySolrRunners()) {
-        if (jetty.isRunning()) {
-          numRunning++;
-        }
-      }
-      if (numServers == numRunning) {
-        return;
-      } else if (0 == i) {
-        // give up
-        break;
-      }
-      // the more nodes we're waiting on, the longer we should try to sleep (within reason)
-      Thread.sleep(Math.min((numServers - numRunning) * 100, 1000));
-    }
-    assertEquals("giving up waiting for all jetty instances to be running",
-                 numServers, numRunning);
-  }
   
   /** Asserts that the UpdateResponse contains the specified expectedErrs and no others */
   public static void assertUpdateTolerantErrors(String assertionMsgPrefix,

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorRandomCloud.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorRandomCloud.java b/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorRandomCloud.java
index c60c22b..ef07a77 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorRandomCloud.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorRandomCloud.java
@@ -41,7 +41,6 @@ import org.apache.solr.cloud.TestTolerantUpdateProcessorCloud.ExpectedErr;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.SolrInputField;
-import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.SolrParams;
 import org.junit.AfterClass;
 import org.junit.Before;
@@ -96,8 +95,6 @@ public class TestTolerantUpdateProcessorRandomCloud extends SolrCloudTestCase {
     configureCluster(numServers)
       .addConfig(configName, configDir.toPath())
       .configure();
-
-    TestTolerantUpdateProcessorCloud.assertSpinLoopAllJettyAreRunning(cluster);
     
     Map<String, String> collectionProperties = new HashMap<>();
     collectionProperties.put("config", "solrconfig-distrib-update-processor-chains.xml");
@@ -110,6 +107,8 @@ public class TestTolerantUpdateProcessorRandomCloud extends SolrCloudTestCase {
         .setProperties(collectionProperties)
         .process(CLOUD_CLIENT);
 
+    cluster.waitForActiveCollection(COLLECTION_NAME, numShards, numShards * repFactor);
+    
     if (NODE_CLIENTS != null) {
       for (HttpSolrClient client : NODE_CLIENTS) {
         client.close();
@@ -123,9 +122,6 @@ public class TestTolerantUpdateProcessorRandomCloud extends SolrCloudTestCase {
     }
     assertEquals(numServers, NODE_CLIENTS.size());
     
-    ZkStateReader zkStateReader = CLOUD_CLIENT.getZkStateReader();
-    AbstractDistribZkTestBase.waitForRecoveriesToFinish(COLLECTION_NAME, zkStateReader, true, true, 330);
-    
   }
   
   @Before

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/TestUtilizeNode.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestUtilizeNode.java b/solr/core/src/test/org/apache/solr/cloud/TestUtilizeNode.java
index 18ac662..5a28211 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestUtilizeNode.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestUtilizeNode.java
@@ -73,7 +73,6 @@ public class TestUtilizeNode extends SolrCloudTestCase {
 
   @Test
   public void test() throws Exception {
-    cluster.waitForAllNodes(5000);
     int REPLICATION = 2;
     String coll = "utilizenodecoll";
     CloudSolrClient cloudClient = cluster.getSolrClient();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/TestWithCollection.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestWithCollection.java b/solr/core/src/test/org/apache/solr/cloud/TestWithCollection.java
index 52e659a..15a32da 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestWithCollection.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestWithCollection.java
@@ -17,11 +17,13 @@
 
 package org.apache.solr.cloud;
 
+import static org.apache.solr.cloud.autoscaling.AutoScalingHandlerTest.createAutoScalingRequest;
+import static org.apache.solr.common.params.CollectionAdminParams.WITH_COLLECTION;
+
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.nio.charset.StandardCharsets;
 import java.util.List;
-import java.util.Map;
 import java.util.Optional;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.TimeUnit;
@@ -39,21 +41,18 @@ import org.apache.solr.cloud.autoscaling.ComputePlanAction;
 import org.apache.solr.cloud.autoscaling.ExecutePlanAction;
 import org.apache.solr.cloud.autoscaling.TriggerActionBase;
 import org.apache.solr.cloud.autoscaling.TriggerEvent;
-import org.apache.solr.common.cloud.ClusterState;
 import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.util.TimeSource;
 import org.apache.solr.util.LogLevel;
 import org.apache.solr.util.TimeOut;
-import org.junit.BeforeClass;
+import org.junit.After;
+import org.junit.Before;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import static org.apache.solr.cloud.autoscaling.AutoScalingHandlerTest.createAutoScalingRequest;
-import static org.apache.solr.common.params.CollectionAdminParams.WITH_COLLECTION;
-
 /**
  * Tests for co-locating a collection with another collection such that any Collection API
  * always ensures that the co-location is never broken.
@@ -68,30 +67,16 @@ public class TestWithCollection extends SolrCloudTestCase {
 
   private static final int NUM_JETTIES = 2;
 
-  @BeforeClass
-  public static void setupCluster() throws Exception {
+  @Before
+  public void setupCluster() throws Exception {
     configureCluster(NUM_JETTIES)
         .addConfig("conf", configset("cloud-minimal"))
         .configure();
-  }
 
-  @Override
-  public void setUp() throws Exception {
-    super.setUp();
     if (zkClient().exists(ZkStateReader.SOLR_AUTOSCALING_CONF_PATH, true))  {
       zkClient().setData(ZkStateReader.SOLR_AUTOSCALING_CONF_PATH, "{}".getBytes(StandardCharsets.UTF_8), true);
     }
-    ClusterState clusterState = cluster.getSolrClient().getZkStateReader().getClusterState();
-    for (Map.Entry<String, ClusterState.CollectionRef> entry : clusterState.getCollectionStates().entrySet()) {
-      if (entry.getKey().contains("_xyz"))  {
-        try {
-          CollectionAdminRequest.deleteCollection(entry.getKey()).process(cluster.getSolrClient());
-        } catch (Exception e) {
-          log.error("Exception while deleting collection: " + entry.getKey());
-        }
-      }
-    }
-    cluster.deleteAllCollections();
+
     cluster.getSolrClient().setDefaultCollection(null);
 
     cloudManager = cluster.getJettySolrRunner(0).getCoreContainer().getZkController().getSolrCloudManager();
@@ -100,18 +85,11 @@ public class TestWithCollection extends SolrCloudTestCase {
     deleteChildrenRecursively(ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH);
     deleteChildrenRecursively(ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH);
     LATCH = new CountDownLatch(1);
-
-    int jettys = cluster.getJettySolrRunners().size();
-    if (jettys < NUM_JETTIES) {
-      for (int i = jettys; i < NUM_JETTIES; i++) {
-        cluster.startJettySolrRunner();
-      }
-    } else  {
-      for (int i = jettys; i > NUM_JETTIES; i--) {
-        cluster.stopJettySolrRunner(i - 1);
-      }
-    }
-    cluster.waitForAllNodes(30);
+  }
+  
+  @After
+  public void teardownCluster() throws Exception {
+    shutdownCluster();
   }
 
   private void deleteChildrenRecursively(String path) throws Exception {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/TlogReplayBufferedWhileIndexingTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/TlogReplayBufferedWhileIndexingTest.java b/solr/core/src/test/org/apache/solr/cloud/TlogReplayBufferedWhileIndexingTest.java
index 027c7fa..18eabc2 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TlogReplayBufferedWhileIndexingTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TlogReplayBufferedWhileIndexingTest.java
@@ -22,7 +22,6 @@ import java.util.List;
 
 import org.apache.lucene.util.LuceneTestCase.Nightly;
 import org.apache.solr.SolrTestCaseJ4.SuppressSSL;
-import org.apache.solr.SolrTestCaseJ4.SuppressObjectReleaseTracker;
 import org.apache.lucene.util.LuceneTestCase.Slow;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.embedded.JettySolrRunner;
@@ -35,7 +34,6 @@ import org.junit.Test;
 @Slow
 @Nightly
 @SuppressSSL
-@SuppressObjectReleaseTracker(bugUrl="Testing purposes")
 public class TlogReplayBufferedWhileIndexingTest extends AbstractFullDistribZkTestBase {
 
   private List<StoppableIndexingThread> threads;
@@ -79,7 +77,7 @@ public class TlogReplayBufferedWhileIndexingTest extends AbstractFullDistribZkTe
     allJetty.addAll(jettys);
     allJetty.remove(shardToLeaderJetty.get("shard1").jetty);
     assert allJetty.size() == 1 : allJetty.size();
-    ChaosMonkey.stop(allJetty.get(0));
+    allJetty.get(0).stop();
     
     StoppableIndexingThread indexThread;
     for (int i = 0; i < numThreads; i++) {
@@ -92,7 +90,7 @@ public class TlogReplayBufferedWhileIndexingTest extends AbstractFullDistribZkTe
 
     Thread.sleep(2000);
     
-    ChaosMonkey.start(allJetty.get(0));
+    allJetty.get(0).start();
     
     Thread.sleep(45000);
   

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/VMParamsZkACLAndCredentialsProvidersTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/VMParamsZkACLAndCredentialsProvidersTest.java b/solr/core/src/test/org/apache/solr/cloud/VMParamsZkACLAndCredentialsProvidersTest.java
index 95422fa..36fb989 100644
--- a/solr/core/src/test/org/apache/solr/cloud/VMParamsZkACLAndCredentialsProvidersTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/VMParamsZkACLAndCredentialsProvidersTest.java
@@ -63,7 +63,7 @@ public class VMParamsZkACLAndCredentialsProvidersTest extends SolrTestCaseJ4 {
         + "zookeeper/server1/data";
     log.info("ZooKeeper dataDir:" + zkDir);
     zkServer = new ZkTestServer(zkDir);
-    zkServer.run();
+    zkServer.run(false);
     
     System.setProperty("zkHost", zkServer.getZkAddress());
     
@@ -194,7 +194,10 @@ public class VMParamsZkACLAndCredentialsProvidersTest extends SolrTestCaseJ4 {
         zkClient.delete(path + "/subnode", -1, false);
       }
     } catch (NoAuthException nae) {
-      if (create) fail("No NoAuthException expected");
+      if (create) {
+        nae.printStackTrace();
+        fail("No NoAuthException expected");
+      }
       // expected
     }
     

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java b/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
index 5578452..45c4812 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
@@ -93,9 +93,6 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
     try {
       server.run();
 
-      AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost());
-      AbstractZkTestCase.makeSolrZkNode(server.getZkHost());
-
       try (SolrZkClient client = new SolrZkClient(server.getZkAddress(), TIMEOUT)) {
 
         ZkController.createClusterZkNodes(client);
@@ -176,9 +173,6 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
     try {
       server.run();
 
-      AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost());
-      AbstractZkTestCase.makeSolrZkNode(server.getZkHost());
-
       SolrZkClient zkClient = new SolrZkClient(server.getZkAddress(), TIMEOUT);
       String actualConfigName = "firstConfig";
 
@@ -228,9 +222,6 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
     try {
       server.run();
 
-      AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost());
-      AbstractZkTestCase.makeSolrZkNode(server.getZkHost());
-
       cc = getCoreContainer();
       ZkController zkController = null;
 
@@ -282,9 +273,6 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
     try {
       server.run();
 
-      AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost());
-      AbstractZkTestCase.makeSolrZkNode(server.getZkHost());
-
       cc = new MockCoreContainer()  {
         @Override
         public List<CoreDescriptor> getCoreDescriptors() {
@@ -336,8 +324,8 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
         zkController.getZkStateReader().forciblyRefreshAllClusterStateSlow();
 
         long now = System.nanoTime();
-        long timeout = now + TimeUnit.NANOSECONDS.convert(ZkController.WAIT_DOWN_STATES_TIMEOUT_SECONDS, TimeUnit.SECONDS);
-        zkController.publishAndWaitForDownStates();
+        long timeout = now + TimeUnit.NANOSECONDS.convert(5, TimeUnit.SECONDS);
+        zkController.publishAndWaitForDownStates(5);
         assertTrue("The ZkController.publishAndWaitForDownStates should have timed out but it didn't", System.nanoTime() >= timeout);
       } finally {
         if (zkController != null)

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java b/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java
index 42d99f8..39f1810 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java
@@ -40,16 +40,22 @@ public class ZkFailoverTest extends SolrCloudTestCase {
   }
 
   @AfterClass
-  public static void cleanUp() {
+  public static void cleanUp() throws Exception {
     System.clearProperty("waitForZk");
+
+    for (int i = 0; i < cluster.getJettySolrRunners().size(); i++) {
+      final JettySolrRunner runner = cluster.getJettySolrRunner(i);
+      runner.stop();
+    }
   }
 
   public void testRestartZkWhenClusterDown() throws Exception {
     String coll = "coll1";
     CollectionAdminRequest.createCollection(coll, 2, 1).process(cluster.getSolrClient());
+    cluster.waitForActiveCollection(coll, 2, 2);
     cluster.getSolrClient().add(coll, new SolrInputDocument("id", "1"));
     for (JettySolrRunner runner : cluster.getJettySolrRunners()) {
-      ChaosMonkey.stop(runner);
+      runner.stop();
     }
     ZkTestServer zkTestServer = cluster.getZkServer();
     zkTestServer.shutdown();
@@ -58,7 +64,7 @@ public class ZkFailoverTest extends SolrCloudTestCase {
       final JettySolrRunner runner = cluster.getJettySolrRunner(i);
       threads[i] = new Thread(() -> {
         try {
-          ChaosMonkey.start(runner);
+          runner.start();
         } catch (Exception e) {
           e.printStackTrace();
         }
@@ -67,12 +73,12 @@ public class ZkFailoverTest extends SolrCloudTestCase {
     }
     Thread.sleep(5000);
     zkTestServer = new ZkTestServer(zkTestServer.getZkDir(), zkTestServer.getPort());
-    zkTestServer.run();
+    zkTestServer.run(false);
     for (Thread thread : threads) {
       thread.join();
     }
     waitForLiveNodes(2);
-    waitForState("Timeout waiting for " + coll, coll, clusterShape(2, 1));
+    waitForState("Timeout waiting for " + coll, coll, clusterShape(2, 2));
     QueryResponse rsp = new QueryRequest(new SolrQuery("*:*")).process(cluster.getSolrClient(), coll);
     assertEquals(1, rsp.getResults().getNumFound());
     zkTestServer.shutdown();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/ZkSolrClientTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/ZkSolrClientTest.java b/solr/core/src/test/org/apache/solr/cloud/ZkSolrClientTest.java
index 120457c..276a04c 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ZkSolrClientTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ZkSolrClientTest.java
@@ -21,6 +21,7 @@ import java.util.Set;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
+
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.common.cloud.SolrZkClient;
 import org.apache.solr.common.cloud.ZkCmdExecutor;
@@ -53,9 +54,6 @@ public class ZkSolrClientTest extends SolrTestCaseJ4 {
       server = new ZkTestServer(zkDir);
       server.run();
 
-      AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost());
-      if (makeRoot) AbstractZkTestCase.makeSolrZkNode(server.getZkHost());
-
       zkClient = new SolrZkClient(server.getZkAddress(), AbstractZkTestCase.TIMEOUT);
     }
 
@@ -109,45 +107,59 @@ public class ZkSolrClientTest extends SolrTestCaseJ4 {
   public void testReconnect() throws Exception {
     String zkDir = createTempDir("zkData").toFile().getAbsolutePath();
     ZkTestServer server = null;
-    SolrZkClient zkClient = null;
-    try {
-      server = new ZkTestServer(zkDir);
-      server.run();
-      AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost());
-      AbstractZkTestCase.makeSolrZkNode(server.getZkHost());
+    server = new ZkTestServer(zkDir);
+    server.run();
+    try (SolrZkClient zkClient = new SolrZkClient(server.getZkAddress(), AbstractZkTestCase.TIMEOUT);) {
 
-      final SolrZkClient zkClientConLoss = new SolrZkClient(server.getZkAddress(), AbstractZkTestCase.TIMEOUT);
-      zkClient = zkClientConLoss;
       String shardsPath = "/collections/collection1/shards";
       zkClient.makePath(shardsPath, false, true);
 
-      zkClient.makePath("collections/collection1", false, true);
       int zkServerPort = server.getPort();
       // this tests disconnect state
       server.shutdown();
 
       Thread.sleep(80);
 
+      Thread thread = new Thread() {
+        public void run() {
+          try {
+            zkClient.makePath("collections/collection2", false);
+           // Assert.fail("Server should be down here");
+          } catch (KeeperException | InterruptedException e) {
 
-      expectThrows(KeeperException.class,
-          "Server should be down",
-          () -> zkClientConLoss.makePath("collections/collection2", false)
-      );
+          }
+        }
+      };
+
+      thread.start();
 
       // bring server back up
       server = new ZkTestServer(zkDir, zkServerPort);
-      server.run();
+      server.run(false);
 
       // TODO: can we do better?
       // wait for reconnect
       Thread.sleep(600);
 
-      try {
-        zkClient.makePath("collections/collection3", true);
-      } catch (KeeperException.ConnectionLossException e) {
-        Thread.sleep(5000); // try again in a bit
-        zkClient.makePath("collections/collection3", true);
-      }
+      Thread thread2 = new Thread() {
+        public void run() {
+          try {
+
+            zkClient.makePath("collections/collection3", true);
+
+          } catch (KeeperException e) {
+            throw new RuntimeException(e);
+          } catch (InterruptedException e) {
+            throw new RuntimeException(e);
+          }
+        }
+      };
+
+      thread2.start();
+
+      thread.join();
+      
+      thread2.join();
 
       assertNotNull(zkClient.exists("/collections/collection3", null, true));
       assertNotNull(zkClient.exists("/collections/collection1", null, true));
@@ -179,9 +191,6 @@ public class ZkSolrClientTest extends SolrTestCaseJ4 {
 
     } finally {
 
-      if (zkClient != null) {
-        zkClient.close();
-      }
       if (server != null) {
         server.shutdown();
       }
@@ -195,8 +204,6 @@ public class ZkSolrClientTest extends SolrTestCaseJ4 {
     try {
       server = new ZkTestServer(zkDir);
       server.run();
-      AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost());
-      AbstractZkTestCase.makeSolrZkNode(server.getZkHost());
 
       final int timeout = random().nextInt(10000) + 5000;
       

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/api/collections/AssignTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/AssignTest.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/AssignTest.java
index d5197ca..638496a 100644
--- a/solr/core/src/test/org/apache/solr/cloud/api/collections/AssignTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/AssignTest.java
@@ -16,7 +16,13 @@
  */
 package org.apache.solr.cloud.api.collections;
 
-import java.io.IOException;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.ArgumentMatchers.anyBoolean;
+import static org.mockito.ArgumentMatchers.anyInt;
+import static org.mockito.ArgumentMatchers.anyString;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
@@ -42,18 +48,10 @@ import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.SolrZkClient;
 import org.apache.solr.common.util.ExecutorUtil;
 import org.apache.solr.common.util.Utils;
-import org.apache.zookeeper.KeeperException;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 
-import static org.mockito.ArgumentMatchers.any;
-import static org.mockito.ArgumentMatchers.anyBoolean;
-import static org.mockito.ArgumentMatchers.anyInt;
-import static org.mockito.ArgumentMatchers.anyString;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.when;
-
 public class AssignTest extends SolrTestCaseJ4 {
   
   @Override
@@ -109,14 +107,13 @@ public class AssignTest extends SolrTestCaseJ4 {
 
       try (SolrZkClient zkClient = new SolrZkClient(server.getZkAddress(), 10000)) {
         assertTrue(zkClient.isConnected());
-        zkClient.makePath("/", true);
         for (String c : collections) {
-          zkClient.makePath("/collections/"+c, true);
+          zkClient.makePath("/collections/" + c, true);
         }
         // TODO: fix this to be independent of ZK
         ZkDistribStateManager stateManager = new ZkDistribStateManager(zkClient);
         List<Future<?>> futures = new ArrayList<>();
-        for (int i = 0; i < 1000; i++) {
+        for (int i = 0; i < 73; i++) {
           futures.add(executor.submit(() -> {
             String collection = collections[random().nextInt(collections.length)];
             int id = Assign.incAndGetId(stateManager, collection, 0);
@@ -130,7 +127,7 @@ public class AssignTest extends SolrTestCaseJ4 {
           future.get();
         }
       }
-      assertEquals(1000, (long) collectionUniqueIds.values().stream()
+      assertEquals(73, (long) collectionUniqueIds.values().stream()
           .map(ConcurrentHashMap::size)
           .reduce((m1, m2) -> m1 + m2).get());
     } finally {
@@ -141,12 +138,11 @@ public class AssignTest extends SolrTestCaseJ4 {
 
 
   @Test
-  public void testBuildCoreName() throws IOException, InterruptedException, KeeperException {
+  public void testBuildCoreName() throws Exception {
     String zkDir = createTempDir("zkData").toFile().getAbsolutePath();
     ZkTestServer server = new ZkTestServer(zkDir);
     server.run();
     try (SolrZkClient zkClient = new SolrZkClient(server.getZkAddress(), 10000)) {
-      zkClient.makePath("/", true);
       // TODO: fix this to be independent of ZK
       ZkDistribStateManager stateManager = new ZkDistribStateManager(zkClient);
       Map<String, Slice> slices = new HashMap<>();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIAsyncDistributedZkTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIAsyncDistributedZkTest.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIAsyncDistributedZkTest.java
index 7e939a0..b81b956 100644
--- a/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIAsyncDistributedZkTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIAsyncDistributedZkTest.java
@@ -24,6 +24,7 @@ import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
+
 import org.apache.lucene.util.LuceneTestCase.Slow;
 import org.apache.lucene.util.TestUtil;
 import org.apache.solr.client.solrj.SolrClient;
@@ -39,9 +40,11 @@ import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.util.ExecutorUtil;
 import org.apache.solr.util.DefaultSolrThreadFactory;
-import org.junit.BeforeClass;
+import org.junit.After;
+import org.junit.Before;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -56,12 +59,19 @@ public class CollectionsAPIAsyncDistributedZkTest extends SolrCloudTestCase {
   
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
-  @BeforeClass
-  public static void setupCluster() throws Exception {
+  @Before
+  public void setupCluster() throws Exception {
+    // we recreate per test - they need to be isolated to be solid
     configureCluster(2)
         .addConfig("conf1", TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf"))
         .configure();
   }
+  
+  @After
+  public void tearDown() throws Exception {
+    super.tearDown();
+    shutdownCluster();
+  }
 
   @Test
   public void testSolrJAPICalls() throws Exception {
@@ -88,10 +98,14 @@ public class CollectionsAPIAsyncDistributedZkTest extends SolrCloudTestCase {
   }
 
   @Test
-  //commented 9-Aug-2018  @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 21-May-2018
-  @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // added 20-Sep-2018
   public void testAsyncRequests() throws Exception {
-
+    boolean legacy = random().nextBoolean();
+    if (legacy) {
+      CollectionAdminRequest.setClusterProperty(ZkStateReader.LEGACY_CLOUD, "true").process(cluster.getSolrClient());
+    } else {
+      CollectionAdminRequest.setClusterProperty(ZkStateReader.LEGACY_CLOUD, "false").process(cluster.getSolrClient());
+    }
+    
     final String collection = "testAsyncOperations";
     final CloudSolrClient client = cluster.getSolrClient();
 
@@ -101,6 +115,9 @@ public class CollectionsAPIAsyncDistributedZkTest extends SolrCloudTestCase {
         .processAndWait(client, MAX_TIMEOUT_SECONDS);
     assertSame("CreateCollection task did not complete!", RequestStatusState.COMPLETED, state);
 
+    
+    cluster.waitForActiveCollection(collection, 1, 1);
+    
     //Add a few documents to shard1
     int numDocs = TestUtil.nextInt(random(), 10, 100);
     List<SolrInputDocument> docs = new ArrayList<>(numDocs);
@@ -125,6 +142,8 @@ public class CollectionsAPIAsyncDistributedZkTest extends SolrCloudTestCase {
         .processAndWait(client, MAX_TIMEOUT_SECONDS);
     assertSame("CreateShard did not complete", RequestStatusState.COMPLETED, state);
 
+    client.getZkStateReader().forceUpdateCollection(collection);
+    
     //Add a doc to shard2 to make sure shard2 was created properly
     SolrInputDocument doc = new SolrInputDocument();
     doc.addField("id", numDocs + 1);
@@ -143,14 +162,20 @@ public class CollectionsAPIAsyncDistributedZkTest extends SolrCloudTestCase {
     assertSame("AddReplica did not complete", RequestStatusState.COMPLETED, state);
 
     //cloudClient watch might take a couple of seconds to reflect it
-    Slice shard1 = client.getZkStateReader().getClusterState().getCollection(collection).getSlice("shard1");
-    int count = 0;
-    while (shard1.getReplicas().size() != 2) {
-      if (count++ > 1000) {
-        fail("2nd Replica not reflecting in the cluster state");
+    client.getZkStateReader().waitForState(collection, 20, TimeUnit.SECONDS, (n, c) -> {
+      if (c == null)
+        return false;
+      Slice slice = c.getSlice("shard1");
+      if (slice == null) {
+        return false;
       }
-      Thread.sleep(100);
-    }
+
+      if (slice.getReplicas().size() == 2) {
+        return true;
+      }
+
+      return false;
+    });
 
     state = CollectionAdminRequest.createAlias("myalias",collection)
         .processAndWait(client, MAX_TIMEOUT_SECONDS);
@@ -170,7 +195,8 @@ public class CollectionsAPIAsyncDistributedZkTest extends SolrCloudTestCase {
     } catch (SolrException e) {
       //expected
     }
-
+    
+    Slice shard1 = client.getZkStateReader().getClusterState().getCollection(collection).getSlice("shard1");
     Replica replica = shard1.getReplicas().iterator().next();
     for (String liveNode : client.getZkStateReader().getClusterState().getLiveNodes()) {
       if (!replica.getNodeName().equals(liveNode)) {
@@ -180,20 +206,23 @@ public class CollectionsAPIAsyncDistributedZkTest extends SolrCloudTestCase {
         break;
       }
     }
-
+    client.getZkStateReader().forceUpdateCollection(collection);
+    
     shard1 = client.getZkStateReader().getClusterState().getCollection(collection).getSlice("shard1");
     String replicaName = shard1.getReplicas().iterator().next().getName();
     state = CollectionAdminRequest.deleteReplica(collection, "shard1", replicaName)
       .processAndWait(client, MAX_TIMEOUT_SECONDS);
     assertSame("DeleteReplica did not complete", RequestStatusState.COMPLETED, state);
 
-    state = CollectionAdminRequest.deleteCollection(collection)
-        .processAndWait(client, MAX_TIMEOUT_SECONDS);
-    assertSame("DeleteCollection did not complete", RequestStatusState.COMPLETED, state);
+    if (!legacy) {
+      state = CollectionAdminRequest.deleteCollection(collection)
+          .processAndWait(client, MAX_TIMEOUT_SECONDS);
+      assertSame("DeleteCollection did not complete", RequestStatusState.COMPLETED, state);
+    }
   }
-  // commented 4-Sep-2018  @LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 2-Aug-2018
-  @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 14-Oct-2018
+
   public void testAsyncIdRaceCondition() throws Exception {
+
     SolrClient[] clients = new SolrClient[cluster.getJettySolrRunners().size()];
     int j = 0;
     for (JettySolrRunner r:cluster.getJettySolrRunners()) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIDistributedZkTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIDistributedZkTest.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIDistributedZkTest.java
index e1d4344..d019dd8 100644
--- a/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIDistributedZkTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIDistributedZkTest.java
@@ -16,9 +16,9 @@
  */
 package org.apache.solr.cloud.api.collections;
 
-import javax.management.MBeanServer;
-import javax.management.MBeanServerFactory;
-import javax.management.ObjectName;
+import static org.apache.solr.common.cloud.ZkStateReader.CORE_NAME_PROP;
+import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR;
+
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.lang.management.ManagementFactory;
@@ -38,7 +38,10 @@ import java.util.Optional;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 
-import com.google.common.collect.ImmutableList;
+import javax.management.MBeanServer;
+import javax.management.MBeanServerFactory;
+import javax.management.ObjectName;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.lucene.util.LuceneTestCase.Slow;
 import org.apache.lucene.util.TestUtil;
@@ -75,14 +78,13 @@ import org.apache.solr.core.SolrInfoBean.Category;
 import org.apache.solr.util.LogLevel;
 import org.apache.solr.util.TestInjection;
 import org.apache.solr.util.TimeOut;
+import org.junit.After;
 import org.junit.Before;
-import org.junit.BeforeClass;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import static org.apache.solr.common.cloud.ZkStateReader.CORE_NAME_PROP;
-import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR;
+import com.google.common.collect.ImmutableList;
 
 /**
  * Tests the Cloud Collections API.
@@ -91,16 +93,14 @@ import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR;
 public class CollectionsAPIDistributedZkTest extends SolrCloudTestCase {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
-  @BeforeClass
-  public static void beforeCollectionsAPIDistributedZkTest() {
+  @Before
+  public void setupCluster() throws Exception {
     // we don't want this test to have zk timeouts
-    System.setProperty("zkClientTimeout", "240000");
-    TestInjection.randomDelayInCoreCreation = "true:20";
+    System.setProperty("zkClientTimeout", "60000");
+    System.setProperty("createCollectionWaitTimeTillActive", "5");
+    TestInjection.randomDelayInCoreCreation = "true:5";
     System.setProperty("validateAfterInactivity", "200");
-  }
-
-  @BeforeClass
-  public static void setupCluster() throws Exception {
+    
     String solrXml = IOUtils.toString(CollectionsAPIDistributedZkTest.class.getResourceAsStream("/solr/solr-jmxreporter.xml"), "UTF-8");
     configureCluster(4)
         .addConfig("conf", configset("cloud-minimal"))
@@ -108,14 +108,11 @@ public class CollectionsAPIDistributedZkTest extends SolrCloudTestCase {
         .withSolrXml(solrXml)
         .configure();
   }
-
-  @Before
-  public void clearCluster() throws Exception {
-    try {
-      cluster.deleteAllCollections();
-    } finally {
-      System.clearProperty("zkClientTimeout");
-    }
+  
+  @After
+  public void tearDownCluster() throws Exception {
+    shutdownCluster();
+    System.clearProperty("createCollectionWaitTimeTillActive");
   }
 
   @Test
@@ -428,6 +425,14 @@ public class CollectionsAPIDistributedZkTest extends SolrCloudTestCase {
     // create new collections rapid fire
     int cnt = random().nextInt(TEST_NIGHTLY ? 3 : 1) + 1;
     CollectionAdminRequest.Create[] createRequests = new CollectionAdminRequest.Create[cnt];
+    
+    class Coll {
+      String name;
+      int numShards;
+      int replicationFactor;
+    }
+    
+    List<Coll> colls = new ArrayList<>();
 
     for (int i = 0; i < cnt; i++) {
 
@@ -439,25 +444,30 @@ public class CollectionsAPIDistributedZkTest extends SolrCloudTestCase {
           = CollectionAdminRequest.createCollection("awhollynewcollection_" + i, "conf2", numShards, replicationFactor)
           .setMaxShardsPerNode(maxShardsPerNode);
       createRequests[i].processAsync(cluster.getSolrClient());
+      
+      Coll coll = new Coll();
+      coll.name = "awhollynewcollection_" + i;
+      coll.numShards = numShards;
+      coll.replicationFactor = replicationFactor;
+      colls.add(coll);
     }
 
-    for (int i = 0; i < cnt; i++) {
-      String collectionName = "awhollynewcollection_" + i;
-      final int j = i;
-      waitForState("Expected to see collection " + collectionName, collectionName,
-          (n, c) -> {
-            CollectionAdminRequest.Create req = createRequests[j];
-            return DocCollection.isFullyActive(n, c, req.getNumShards(), req.getReplicationFactor());
-          });
+    for (Coll coll : colls) {
+      cluster.waitForActiveCollection(coll.name, coll.numShards, coll.numShards * coll.replicationFactor);
     }
 
-    cluster.injectChaos(random());
+    waitForStable(cnt, createRequests);
 
     for (int i = 0; i < cluster.getJettySolrRunners().size(); i++) {
       checkInstanceDirs(cluster.getJettySolrRunner(i));
     }
-
+    
     String collectionName = createRequests[random().nextInt(createRequests.length)].getCollectionName();
+    
+    // TODO: we should not need this... beast this test well when trying to fix it
+    Thread.sleep(1000);
+    
+    cluster.getSolrClient().getZkStateReader().forciblyRefreshAllClusterStateSlow();
 
     new UpdateRequest()
         .add("id", "6")
@@ -483,6 +493,25 @@ public class CollectionsAPIDistributedZkTest extends SolrCloudTestCase {
     checkNoTwoShardsUseTheSameIndexDir();
   }
 
+  private void waitForStable(int cnt, CollectionAdminRequest.Create[] createRequests) throws InterruptedException {
+    for (int i = 0; i < cnt; i++) {
+      String collectionName = "awhollynewcollection_" + i;
+      final int j = i;
+      waitForState("Expected to see collection " + collectionName, collectionName,
+          (n, c) -> {
+            CollectionAdminRequest.Create req = createRequests[j];
+            return DocCollection.isFullyActive(n, c, req.getNumShards(), req.getReplicationFactor());
+          });
+      
+      ZkStateReader zkStateReader = cluster.getSolrClient().getZkStateReader();
+      // make sure we have leaders for each shard
+      for (int z = 1; z < createRequests[j].getNumShards(); z++) {
+        zkStateReader.getLeaderRetry(collectionName, "shard" + z, 10000);
+      }
+      
+    }
+  }
+
   @Test
   public void testCollectionReload() throws Exception {
 
@@ -621,6 +650,7 @@ public class CollectionsAPIDistributedZkTest extends SolrCloudTestCase {
     CollectionAdminRequest.createCollection(collectionName, "conf", 2, 2)
         .setMaxShardsPerNode(4)
         .process(cluster.getSolrClient());
+    cluster.waitForActiveCollection(collectionName, 2, 4);
 
     ArrayList<String> nodeList
         = new ArrayList<>(cluster.getSolrClient().getZkStateReader().getClusterState().getLiveNodes());

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/api/collections/HdfsCollectionsAPIDistributedZkTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/HdfsCollectionsAPIDistributedZkTest.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/HdfsCollectionsAPIDistributedZkTest.java
index ed962ec..20706ef 100644
--- a/solr/core/src/test/org/apache/solr/cloud/api/collections/HdfsCollectionsAPIDistributedZkTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/HdfsCollectionsAPIDistributedZkTest.java
@@ -84,7 +84,6 @@ public class HdfsCollectionsAPIDistributedZkTest extends CollectionsAPIDistribut
 
   @Test
   public void moveReplicaTest() throws Exception {
-    cluster.waitForAllNodes(5000);
     String coll = "movereplicatest_coll";
 
     CloudSolrClient cloudClient = cluster.getSolrClient();
@@ -130,7 +129,7 @@ public class HdfsCollectionsAPIDistributedZkTest extends CollectionsAPIDistribut
     checkNumOfCores(cloudClient, replica.getNodeName(), 0);
     checkNumOfCores(cloudClient, targetNode, 2);
 
-    waitForState("Wait for recovery finish failed",coll, clusterShape(2,2));
+    waitForState("Wait for recovery finish failed",coll, clusterShape(2,4));
     slice = cloudClient.getZkStateReader().getClusterState().getCollection(coll).getSlice(slice.getName());
     boolean found = false;
     for (Replica newReplica : slice.getReplicas()) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/api/collections/ShardSplitTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/ShardSplitTest.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/ShardSplitTest.java
index 0b474e5..6098ed8 100644
--- a/solr/core/src/test/org/apache/solr/cloud/api/collections/ShardSplitTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/ShardSplitTest.java
@@ -28,6 +28,7 @@ import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicReference;
@@ -45,8 +46,8 @@ import org.apache.solr.client.solrj.response.CollectionAdminResponse;
 import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.client.solrj.response.RequestStatusState;
 import org.apache.solr.cloud.AbstractDistribZkTestBase;
-import org.apache.solr.cloud.AbstractFullDistribZkTestBase;
-import org.apache.solr.cloud.ChaosMonkey;
+import org.apache.solr.cloud.BasicDistributedZkTest;
+import org.apache.solr.cloud.SolrCloudTestCase;
 import org.apache.solr.cloud.StoppableIndexingThread;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.cloud.ClusterState;
@@ -78,7 +79,7 @@ import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR;
 
 @Slow
 @LogLevel("org.apache.solr.cloud.Overseer=DEBUG;org.apache.solr.cloud.overseer=DEBUG;org.apache.solr.cloud.api.collections=DEBUG;org.apache.solr.cloud.OverseerTaskProcessor=DEBUG;org.apache.solr.util.TestInjection=DEBUG")
-public class ShardSplitTest extends AbstractFullDistribZkTestBase {
+public class ShardSplitTest extends BasicDistributedZkTest {
 
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
@@ -96,7 +97,7 @@ public class ShardSplitTest extends AbstractFullDistribZkTestBase {
   }
 
   @Test
-  // 12-Jun-2018 @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028")
+  @Nightly
   public void test() throws Exception {
 
     waitForThingsToLevelOut(15);
@@ -143,6 +144,9 @@ public class ShardSplitTest extends AbstractFullDistribZkTestBase {
     create.setMaxShardsPerNode(5); // some high number so we can create replicas without hindrance
     create.setCreateNodeSet(nodeName); // we want to create the leader on a fixed node so that we know which one to restart later
     create.process(cloudClient);
+    
+    cloudClient.waitForState(collectionName, 30, TimeUnit.SECONDS, SolrCloudTestCase.activeClusterShape(1, 1));
+    
     try (CloudSolrClient client = getCloudSolrClient(zkServer.getZkAddress(), true, cloudClient.getLbClient().getHttpClient())) {
       client.setDefaultCollection(collectionName);
       StoppableIndexingThread thread = new StoppableIndexingThread(controlClient, client, "i1", true);
@@ -185,12 +189,14 @@ public class ShardSplitTest extends AbstractFullDistribZkTestBase {
           int liveNodeCount = client.getZkStateReader().getClusterState().getLiveNodes().size();
 
           // restart the sub-shard leader node
+          String stoppedNodeName = null;
           boolean restarted = false;
           for (JettySolrRunner jetty : jettys) {
             int port = jetty.getBaseUrl().getPort();
             if (replica.getStr(BASE_URL_PROP).contains(":" + port))  {
-              ChaosMonkey.kill(jetty);
-              ChaosMonkey.start(jetty);
+              stoppedNodeName = jetty.getNodeName();
+              jetty.stop();
+              jetty.start();
               restarted = true;
               break;
             }
@@ -199,6 +205,8 @@ public class ShardSplitTest extends AbstractFullDistribZkTestBase {
             // sanity check
             fail("We could not find a jetty to kill for replica: " + replica.getCoreUrl());
           }
+          
+          cloudClient.getZkStateReader().waitForLiveNodes(30, TimeUnit.SECONDS, SolrCloudTestCase.containsLiveNode(stoppedNodeName));
 
           // add a new replica for the sub-shard
           CollectionAdminRequest.AddReplica addReplica = CollectionAdminRequest.addReplicaToShard(collectionName, SHARD1_0);
@@ -208,6 +216,9 @@ public class ShardSplitTest extends AbstractFullDistribZkTestBase {
           try (HttpSolrClient control = new HttpSolrClient.Builder(control_collection).withHttpClient(client.getLbClient().getHttpClient()).build())  {
             state = addReplica.processAndWait(control, 30);
           }
+          
+          cloudClient.waitForState(collectionName, 30, TimeUnit.SECONDS, SolrCloudTestCase.activeClusterShape(2, 4));
+          
           if (state == RequestStatusState.COMPLETED)  {
             CountDownLatch newReplicaLatch = new CountDownLatch(1);
             client.getZkStateReader().registerCollectionStateWatcher(collectionName, (liveNodes, collectionState) -> {
@@ -319,6 +330,7 @@ public class ShardSplitTest extends AbstractFullDistribZkTestBase {
   }
 
   @Test
+  @Nightly
   public void testSplitAfterFailedSplit2() throws Exception {
     waitForThingsToLevelOut(15);
 
@@ -345,9 +357,12 @@ public class ShardSplitTest extends AbstractFullDistribZkTestBase {
   private void doSplitMixedReplicaTypes(SolrIndexSplitter.SplitMethod splitMethod) throws Exception {
     waitForThingsToLevelOut(15);
     String collectionName = "testSplitMixedReplicaTypes_" + splitMethod.toLower();
-    CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collectionName, "conf1", 1, 2, 2, 2);
+    CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collectionName, "conf1", 1, 2, 0, 2); // TODO tlog replicas disabled right now.
     create.setMaxShardsPerNode(5); // some high number so we can create replicas without hindrance
     create.process(cloudClient);
+    
+    cloudClient.waitForState(collectionName, 30, TimeUnit.SECONDS, SolrCloudTestCase.activeClusterShape(1, 4));
+    
     waitForRecoveriesToFinish(collectionName, false);
 
     for (int i = 0; i < 100; i++) {
@@ -360,6 +375,8 @@ public class ShardSplitTest extends AbstractFullDistribZkTestBase {
     splitShard.setSplitMethod(splitMethod.toLower());
     CollectionAdminResponse rsp = splitShard.process(cloudClient);
     waitForThingsToLevelOut(30);
+   
+    cloudClient.waitForState(collectionName, 30, TimeUnit.SECONDS, SolrCloudTestCase.activeClusterShape(2, 12));
 
     cloudClient.getZkStateReader().forceUpdateCollection(collectionName);
     ClusterState clusterState = cloudClient.getZkStateReader().getClusterState();
@@ -367,10 +384,10 @@ public class ShardSplitTest extends AbstractFullDistribZkTestBase {
     log.info("coll: " + coll);
 
     // verify the original shard
-    verifyShard(coll, SHARD1, Slice.State.INACTIVE, 2, 2, 2);
+    verifyShard(coll, SHARD1, Slice.State.INACTIVE, 2, 0, 2);
     // verify new sub-shards
-    verifyShard(coll, SHARD1_0, Slice.State.ACTIVE, 2, 2, 2);
-    verifyShard(coll, SHARD1_1, Slice.State.ACTIVE, 2, 2, 2);
+    verifyShard(coll, SHARD1_0, Slice.State.ACTIVE, 2, 0, 2);
+    verifyShard(coll, SHARD1_1, Slice.State.ACTIVE, 2, 0, 2);
   }
 
   private void verifyShard(DocCollection coll, String shard, Slice.State expectedState, int numNrt, int numTlog, int numPull) throws Exception {
@@ -392,6 +409,7 @@ public class ShardSplitTest extends AbstractFullDistribZkTestBase {
   }
 
   @Test
+  @Nightly
   public void testSplitWithChaosMonkey() throws Exception {
     waitForThingsToLevelOut(15);
 
@@ -435,7 +453,7 @@ public class ShardSplitTest extends AbstractFullDistribZkTestBase {
             CloudJettyRunner cjetty = shardToLeaderJetty.get(SHARD1);
             try {
               Thread.sleep(1000 + random().nextInt(500));
-              ChaosMonkey.kill(cjetty);
+              cjetty.jetty.stop();
               stop.set(true);
               return true;
             } catch (Exception e) {
@@ -478,7 +496,7 @@ public class ShardSplitTest extends AbstractFullDistribZkTestBase {
 
       CloudJettyRunner cjetty = shardToLeaderJetty.get(SHARD1);
       log.info("Starting shard1 leader jetty at port {}", cjetty.jetty.getLocalPort());
-      ChaosMonkey.start(cjetty.jetty);
+      cjetty.jetty.start();
       cloudClient.getZkStateReader().forceUpdateCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
       log.info("Current collection state: {}", printClusterStateInfo(AbstractDistribZkTestBase.DEFAULT_COLLECTION));
 
@@ -551,6 +569,9 @@ public class ShardSplitTest extends AbstractFullDistribZkTestBase {
     CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collectionName, "conf1", 1, 2);
     create.setMaxShardsPerNode(5); // some high number so we can create replicas without hindrance
     create.process(cloudClient);
+    
+    cloudClient.waitForState(collectionName, 30, TimeUnit.SECONDS, SolrCloudTestCase.activeClusterShape(1, 2));
+    
     waitForRecoveriesToFinish(collectionName, false);
 
     TestInjection.splitLatch = new CountDownLatch(1); // simulate a long split operation
@@ -625,8 +646,15 @@ public class ShardSplitTest extends AbstractFullDistribZkTestBase {
     String collectionName = "shardSplitWithRule_" + splitMethod.toLower();
     CollectionAdminRequest.Create createRequest = CollectionAdminRequest.createCollection(collectionName, "conf1", 1, 2)
         .setRule("shard:*,replica:<2,node:*");
+
     CollectionAdminResponse response = createRequest.process(cloudClient);
     assertEquals(0, response.getStatus());
+    
+    try {
+      cloudClient.waitForState(collectionName, 30, TimeUnit.SECONDS, SolrCloudTestCase.activeClusterShape(1, 2));
+    } catch (TimeoutException e) {
+      throw new RuntimeException("Timeout waiting for 1 shard and 2 replicas.", e);
+    }
 
     CollectionAdminRequest.SplitShard splitShardRequest = CollectionAdminRequest.splitShard(collectionName)
         .setShardName("shard1").setSplitMethod(splitMethod.toLower());
@@ -784,7 +812,7 @@ public class ShardSplitTest extends AbstractFullDistribZkTestBase {
           OverseerCollectionMessageHandler.NUM_SLICES, numShards,
           "router.field", shard_fld);
 
-      createCollection(collectionInfos, collectionName,props,client);
+      createCollection(collectionInfos, collectionName, props, client);
     }
 
     List<Integer> list = collectionInfos.get(collectionName);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/api/collections/SimpleCollectionCreateDeleteTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/SimpleCollectionCreateDeleteTest.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/SimpleCollectionCreateDeleteTest.java
index 0b75bd5..971bb81 100644
--- a/solr/core/src/test/org/apache/solr/cloud/api/collections/SimpleCollectionCreateDeleteTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/SimpleCollectionCreateDeleteTest.java
@@ -16,11 +16,20 @@
  */
 package org.apache.solr.cloud.api.collections;
 
+import java.util.Collection;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import org.apache.solr.client.solrj.embedded.JettySolrRunner;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.cloud.AbstractFullDistribZkTestBase;
 import org.apache.solr.cloud.OverseerCollectionConfigSetProcessor;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.TimeSource;
+import org.apache.solr.core.CoreDescriptor;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.util.TimeOut;
 import org.junit.Test;
 
 public class SimpleCollectionCreateDeleteTest extends AbstractFullDistribZkTestBase {
@@ -54,6 +63,32 @@ public class SimpleCollectionCreateDeleteTest extends AbstractFullDistribZkTestB
       cloudClient.request(delete);
 
       assertFalse(cloudClient.getZkStateReader().getZkClient().exists(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collectionName, false));
+      
+      // currently, removing a collection does not wait for cores to be unloaded
+      TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+      while (true) {
+        
+        if( timeout.hasTimedOut() ) {
+          throw new TimeoutException("Timed out waiting for all collections to be fully removed.");
+        }
+        
+        boolean allContainersEmpty = true;
+        for(JettySolrRunner jetty : jettys) {
+          
+          Collection<SolrCore> cores = jetty.getCoreContainer().getCores();
+          for (SolrCore core : cores) {
+            CoreDescriptor cd = core.getCoreDescriptor();
+            if (cd != null) {
+              if (cd.getCloudDescriptor().getCollectionName().equals(collectionName)) {
+                allContainersEmpty = false;
+              }
+            }
+          }
+        }
+        if (allContainersEmpty) {
+          break;
+        }
+      }
 
       // create collection again on a node other than the overseer leader
       create = CollectionAdminRequest.createCollection(collectionName,1,1)

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/api/collections/TestCollectionsAPIViaSolrCloudCluster.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/TestCollectionsAPIViaSolrCloudCluster.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/TestCollectionsAPIViaSolrCloudCluster.java
index 6ee616f..34355b7 100644
--- a/solr/core/src/test/org/apache/solr/cloud/api/collections/TestCollectionsAPIViaSolrCloudCluster.java
+++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/TestCollectionsAPIViaSolrCloudCluster.java
@@ -88,13 +88,17 @@ public class TestCollectionsAPIViaSolrCloudCluster extends SolrCloudTestCase {
           .setCreateNodeSet(createNodeSet)
           .setProperties(collectionProperties)
           .process(cluster.getSolrClient());
+
+    }
+    
+    if (createNodeSet != null && createNodeSet.equals(OverseerCollectionMessageHandler.CREATE_NODE_SET_EMPTY)) {
+      cluster.waitForActiveCollection(collectionName, numShards, 0);
+    } else {
+      cluster.waitForActiveCollection(collectionName, numShards, numShards * numReplicas);
     }
-    AbstractDistribZkTestBase.waitForRecoveriesToFinish
-        (collectionName, cluster.getSolrClient().getZkStateReader(), true, true, 330);
   }
 
   @Test
-  @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028")
   public void testCollectionCreateSearchDelete() throws Exception {
     final CloudSolrClient client = cluster.getSolrClient();
     final String collectionName = "testcollection";
@@ -108,11 +112,15 @@ public class TestCollectionsAPIViaSolrCloudCluster extends SolrCloudTestCase {
 
     // shut down a server
     JettySolrRunner stoppedServer = cluster.stopJettySolrRunner(0);
+    
+    cluster.waitForJettyToStop(stoppedServer);
+    
     assertTrue(stoppedServer.isStopped());
     assertEquals(nodeCount - 1, cluster.getJettySolrRunners().size());
 
     // create a server
     JettySolrRunner startedServer = cluster.startJettySolrRunner();
+    cluster.waitForAllNodes(30);
     assertTrue(startedServer.isRunning());
     assertEquals(nodeCount, cluster.getJettySolrRunners().size());
 
@@ -153,6 +161,7 @@ public class TestCollectionsAPIViaSolrCloudCluster extends SolrCloudTestCase {
 
     // re-create a server (to restore original nodeCount count)
     startedServer = cluster.startJettySolrRunner(jettyToStop);
+    cluster.waitForAllNodes(30);
     assertTrue(startedServer.isRunning());
     assertEquals(nodeCount, cluster.getJettySolrRunners().size());
 
@@ -162,6 +171,8 @@ public class TestCollectionsAPIViaSolrCloudCluster extends SolrCloudTestCase {
 
     // create it again
     createCollection(collectionName, null);
+    
+    cluster.waitForActiveCollection(collectionName, numShards, numShards * numReplicas);
 
     // check that there's no left-over state
     assertEquals(0, client.query(collectionName, new SolrQuery("*:*")).getResults().getNumFound());
@@ -289,7 +300,8 @@ public class TestCollectionsAPIViaSolrCloudCluster extends SolrCloudTestCase {
         assertTrue(jetty.isRunning());
       }
     }
-    AbstractDistribZkTestBase.waitForRecoveriesToFinish(collectionName, zkStateReader, true, true, 330);
+    cluster.waitForAllNodes(30);
+    cluster.waitForActiveCollection(collectionName, numShards, numShards * numReplicas);
 
     zkStateReader.forceUpdateCollection(collectionName);
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/api/collections/TestHdfsCloudBackupRestore.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/TestHdfsCloudBackupRestore.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/TestHdfsCloudBackupRestore.java
index e81bc4b..4d9a30c 100644
--- a/solr/core/src/test/org/apache/solr/cloud/api/collections/TestHdfsCloudBackupRestore.java
+++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/TestHdfsCloudBackupRestore.java
@@ -26,6 +26,8 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.Properties;
 
+import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
+
 import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
@@ -61,8 +63,7 @@ import static org.apache.solr.core.backup.BackupManager.ZK_STATE_DIR;
 @ThreadLeakFilters(defaultFilters = true, filters = {
     BadHdfsThreadsFilter.class // hdfs currently leaks thread(s)
 })
-//05-Jul-2018  @LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 04-May-2018
-//commented 23-AUG-2018  @LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 2-Aug-2018
+@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12866")
 public class TestHdfsCloudBackupRestore extends AbstractCloudBackupRestoreTestCase {
   public static final String SOLR_XML = "<solr>\n" +
       "\n" +

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/api/collections/TestLocalFSCloudBackupRestore.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/TestLocalFSCloudBackupRestore.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/TestLocalFSCloudBackupRestore.java
index 83a6947..e697889 100644
--- a/solr/core/src/test/org/apache/solr/cloud/api/collections/TestLocalFSCloudBackupRestore.java
+++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/TestLocalFSCloudBackupRestore.java
@@ -16,15 +16,16 @@
  */
 package org.apache.solr.cloud.api.collections;
 
+import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
 /**
- * This class implements the tests for local file-system integration for Solr backup/restore capability.
- * Note that the Solr backup/restore still requires a "shared" file-system. Its just that in this case
- * such file-system would be exposed via local file-system API.
+ * This class implements the tests for local file-system integration for Solr backup/restore capability. Note that the
+ * Solr backup/restore still requires a "shared" file-system. Its just that in this case such file-system would be
+ * exposed via local file-system API.
  */
-//commented 9-Aug-2018 @LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 2-Aug-2018
+@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12866")
 public class TestLocalFSCloudBackupRestore extends AbstractCloudBackupRestoreTestCase {
   private static String backupLocation;
 
@@ -59,8 +60,7 @@ public class TestLocalFSCloudBackupRestore extends AbstractCloudBackupRestoreTes
 
   @Override
   @Test
-  //Commented 14-Oct-2018 @BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12028") // added 09-Aug-2018
   public void test() throws Exception {
     super.test();
   }
-  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasIntegrationTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasIntegrationTest.java
index 3c40d8b..5ad5764 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasIntegrationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasIntegrationTest.java
@@ -17,6 +17,8 @@
 
 package org.apache.solr.cloud.autoscaling;
 
+import static org.apache.solr.common.util.Utils.makeMap;
+
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
@@ -41,18 +43,17 @@ import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.TimeSource;
 import org.apache.solr.util.LogLevel;
 import org.apache.solr.util.TimeOut;
-import org.junit.BeforeClass;
+import org.junit.After;
+import org.junit.Before;
 import org.junit.Test;
 
-import static org.apache.solr.common.util.Utils.makeMap;
-
 @LogLevel("org.apache.solr.cloud.autoscaling=DEBUG;org.apache.solr.client.solrj.cloud.autoscaling=DEBUG;org.apache.solr.cloud=DEBUG;org.apache.solr.cloud.Overseer=DEBUG;org.apache.solr.cloud.overseer=DEBUG;")
 public class AutoAddReplicasIntegrationTest extends SolrCloudTestCase {
   private static final String COLLECTION1 =  "testSimple1";
   private static final String COLLECTION2 =  "testSimple2";
 
-  @BeforeClass
-  public static void setupCluster() throws Exception {
+  @Before
+  public void setupCluster() throws Exception {
     configureCluster(3)
         .addConfig("conf", configset("cloud-minimal"))
         .withSolrXml(TEST_PATH().resolve("solr.xml"))
@@ -64,11 +65,15 @@ public class AutoAddReplicasIntegrationTest extends SolrCloudTestCase {
         .build()
         .process(cluster.getSolrClient());
   }
+  
+  @After
+  public void tearDown() throws Exception {
+    shutdownCluster();
+    super.tearDown();
+  }
 
   @Test
   // This apparently fails in both subclasses.
-  // 12-Jun-2018 @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028")
-  // commented 15-Sep-2018 @LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 2-Aug-2018
   public void testSimple() throws Exception {
     JettySolrRunner jetty1 = cluster.getJettySolrRunner(0);
     JettySolrRunner jetty2 = cluster.getJettySolrRunner(1);
@@ -97,25 +102,36 @@ public class AutoAddReplicasIntegrationTest extends SolrCloudTestCase {
     String lostNodeName = lostJetty.getNodeName();
     List<Replica> replacedHdfsReplicas = getReplacedSharedFsReplicas(COLLECTION1, zkStateReader, lostNodeName);
     lostJetty.stop();
+    
+    cluster.waitForJettyToStop(lostJetty);
+    
     waitForNodeLeave(lostNodeName);
+    
     // ensure that 2 shards have 2 active replicas and only 4 replicas in total
     // i.e. old replicas have been deleted.
     // todo remove the condition for total replicas == 4 after SOLR-11591 is fixed
-    waitForState("Waiting for collection " + COLLECTION1, COLLECTION1, (liveNodes, collectionState) -> clusterShape(2, 2).matches(liveNodes, collectionState)
-        && collectionState.getReplicas().size() == 4);
+    waitForState("Waiting for collection " + COLLECTION1, COLLECTION1, (liveNodes, collectionState) -> clusterShape(2, 4).matches(liveNodes, collectionState)
+        && collectionState.getReplicas().size() == 4, 90, TimeUnit.SECONDS);
     checkSharedFsReplicasMovedCorrectly(replacedHdfsReplicas, zkStateReader, COLLECTION1);
     lostJetty.start();
+    
+    cluster.waitForAllNodes(30);
+    
     assertTrue("Timeout waiting for all live and active", ClusterStateUtil.waitForAllActiveAndLiveReplicas(cluster.getSolrClient().getZkStateReader(), 90000));
 
     // check cluster property is considered
     disableAutoAddReplicasInCluster();
     lostNodeName = jetty3.getNodeName();
     jetty3.stop();
+    
+    cluster.waitForJettyToStop(jetty3);
+    
     waitForNodeLeave(lostNodeName);
-    waitForState("Waiting for collection " + COLLECTION1, COLLECTION1, clusterShape(2, 1));
-    jetty3.start();
+    
     waitForState("Waiting for collection " + COLLECTION1, COLLECTION1, clusterShape(2, 2));
-    waitForState("Waiting for collection " + COLLECTION2, COLLECTION2, clusterShape(2, 2));
+    jetty3.start();
+    waitForState("Waiting for collection " + COLLECTION1, COLLECTION1, clusterShape(2, 4));
+    waitForState("Waiting for collection " + COLLECTION2, COLLECTION2, clusterShape(2, 4));
     enableAutoAddReplicasInCluster();
 
 
@@ -132,10 +148,14 @@ public class AutoAddReplicasIntegrationTest extends SolrCloudTestCase {
 
     lostNodeName = jetty2.getNodeName();
     replacedHdfsReplicas = getReplacedSharedFsReplicas(COLLECTION2, zkStateReader, lostNodeName);
+    
     jetty2.stop();
+    
+    cluster.waitForJettyToStop(jetty2);
+    
     waitForNodeLeave(lostNodeName);
-    waitForState("Waiting for collection " + COLLECTION1, COLLECTION1, clusterShape(2, 2));
-    waitForState("Waiting for collection " + COLLECTION2, COLLECTION2, clusterShape(2, 2));
+    waitForState("Waiting for collection " + COLLECTION1, COLLECTION1, clusterShape(2, 4), 45, TimeUnit.SECONDS);
+    waitForState("Waiting for collection " + COLLECTION2, COLLECTION2, clusterShape(2, 4), 45, TimeUnit.SECONDS);
     checkSharedFsReplicasMovedCorrectly(replacedHdfsReplicas, zkStateReader, COLLECTION2);
 
     // overseer failover test..

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75b18319/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasPlanActionTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasPlanActionTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasPlanActionTest.java
index 31bd2fd..1c6d4a8 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasPlanActionTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasPlanActionTest.java
@@ -17,38 +17,49 @@
 
 package org.apache.solr.cloud.autoscaling;
 
+import static org.apache.solr.cloud.autoscaling.AutoScalingHandlerTest.createAutoScalingRequest;
+
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Optional;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
 
 import org.apache.solr.client.solrj.SolrRequest;
 import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventType;
 import org.apache.solr.client.solrj.embedded.JettySolrRunner;
+import org.apache.solr.client.solrj.impl.CloudSolrClient;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.client.solrj.request.V2Request;
 import org.apache.solr.cloud.CloudDescriptor;
 import org.apache.solr.cloud.SolrCloudTestCase;
+import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.ClusterStateUtil;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.CollectionParams;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.params.SolrParams;
-import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.NamedList; 
 import org.apache.solr.common.util.SuppressForbidden;
-import org.apache.solr.common.util.TimeSource;
-import org.apache.solr.util.TimeOut;
+import org.junit.After;
+import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-import static org.apache.solr.cloud.autoscaling.AutoScalingHandlerTest.createAutoScalingRequest;
-
 public class AutoAddReplicasPlanActionTest extends SolrCloudTestCase{
-
+  
   @BeforeClass
   public static void setupCluster() throws Exception {
+    System.setProperty("solr.httpclient.retries", "4");
+    System.setProperty("solr.retries.on.forward", "1");
+    System.setProperty("solr.retries.to.followers", "1"); 
+
+  }
+  
+  @Before
+  public void beforeTest() throws Exception {
     configureCluster(3)
         .addConfig("conf", configset("cloud-minimal"))
         .configure();
@@ -59,6 +70,11 @@ public class AutoAddReplicasPlanActionTest extends SolrCloudTestCase{
         .build()
         .process(cluster.getSolrClient());
   }
+  
+  @After 
+  public void afterTest() throws Exception {
+    shutdownCluster();
+  }
 
   @Test
   @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028")
@@ -85,7 +101,11 @@ public class AutoAddReplicasPlanActionTest extends SolrCloudTestCase{
         .setAutoAddReplicas(false)
         .setMaxShardsPerNode(3)
         .process(cluster.getSolrClient());
-
+    
+    cluster.waitForActiveCollection(collection1, 2, 4);
+    cluster.waitForActiveCollection(collection2, 1, 2);
+    cluster.waitForActiveCollection("testSimple3", 3, 3);
+    
     // we remove the implicit created trigger, so the replicas won't be moved
     String removeTriggerCommand = "{" +
         "'remove-trigger' : {" +
@@ -102,34 +122,71 @@ public class AutoAddReplicasPlanActionTest extends SolrCloudTestCase{
     List<CloudDescriptor> cloudDescriptors = lostJetty.getCoreContainer().getCores().stream()
         .map(solrCore -> solrCore.getCoreDescriptor().getCloudDescriptor())
         .collect(Collectors.toList());
+    
+    ZkStateReader reader = cluster.getSolrClient().getZkStateReader();
+
     lostJetty.stop();
-    waitForNodeLeave(lostNodeName);
+    
+    cluster.waitForJettyToStop(lostJetty);
+
+    reader.waitForLiveNodes(30, TimeUnit.SECONDS, missingLiveNode(lostNodeName));
+
 
     List<SolrRequest> operations = getOperations(jetty3, lostNodeName);
     assertOperations(collection1, operations, lostNodeName, cloudDescriptors,  null);
 
     lostJetty.start();
-    ClusterStateUtil.waitForAllActiveAndLiveReplicas(cluster.getSolrClient().getZkStateReader(), 30000);
+    cluster.waitForAllNodes(30);
+    
+    cluster.waitForActiveCollection(collection1, 2, 4);
+    cluster.waitForActiveCollection(collection2, 1, 2);
+    cluster.waitForActiveCollection("testSimple3", 3, 3);
+    
+    assertTrue("Timeout waiting for all live and active", ClusterStateUtil.waitForAllActiveAndLiveReplicas(cluster.getSolrClient().getZkStateReader(), 30000));
+    
     String setClusterPreferencesCommand = "{" +
         "'set-cluster-preferences': [" +
         "{'minimize': 'cores','precision': 0}]" +
         "}";
     req = createAutoScalingRequest(SolrRequest.METHOD.POST, setClusterPreferencesCommand);
-    response = cluster.getSolrClient().request(req);
+    
+    // you can hit a stale connection from pool when restarting jetty
+    try (CloudSolrClient cloudClient = new CloudSolrClient.Builder(Collections.singletonList(cluster.getZkServer().getZkAddress()),
+        Optional.empty())
+            .withSocketTimeout(45000).withConnectionTimeout(15000).build()) {
+      response = cloudClient.request(req);
+    }
+
     assertEquals(response.get("result").toString(), "success");
 
     lostJetty = random().nextBoolean()? jetty1 : jetty2;
-    lostNodeName = lostJetty.getNodeName();
+    String lostNodeName2 = lostJetty.getNodeName();
     cloudDescriptors = lostJetty.getCoreContainer().getCores().stream()
         .map(solrCore -> solrCore.getCoreDescriptor().getCloudDescriptor())
         .collect(Collectors.toList());
+    
+
+    
     lostJetty.stop();
-    waitForNodeLeave(lostNodeName);
+   
+    reader.waitForLiveNodes(30, TimeUnit.SECONDS, missingLiveNode(lostNodeName2));
 
-    operations = getOperations(jetty3, lostNodeName);
-    assertOperations(collection1, operations, lostNodeName, cloudDescriptors, jetty3);
+    try {
+      operations = getOperations(jetty3, lostNodeName2);
+    } catch (SolrException e) {
+      // we might get a stale connection from the pool after jetty restarts
+      operations = getOperations(jetty3, lostNodeName2);
+    }
+    
+    assertOperations(collection1, operations, lostNodeName2, cloudDescriptors, jetty3);
 
     lostJetty.start();
+    cluster.waitForAllNodes(30);
+    
+    cluster.waitForActiveCollection(collection1, 2, 4);
+    cluster.waitForActiveCollection(collection2, 1, 2);
+    cluster.waitForActiveCollection("testSimple3", 3, 3);
+    
     assertTrue("Timeout waiting for all live and active", ClusterStateUtil.waitForAllActiveAndLiveReplicas(cluster.getSolrClient().getZkStateReader(), 30000));
 
     new CollectionAdminRequest.AsyncCollectionAdminRequest(CollectionParams.CollectionAction.MODIFYCOLLECTION) {
@@ -142,22 +199,16 @@ public class AutoAddReplicasPlanActionTest extends SolrCloudTestCase{
       }
     }.process(cluster.getSolrClient());
     lostJetty = jetty1;
-    lostNodeName = lostJetty.getNodeName();
+    String lostNodeName3 = lostJetty.getNodeName();
+    
     lostJetty.stop();
-    waitForNodeLeave(lostNodeName);
-    operations = getOperations(jetty3, lostNodeName);
+    
+    reader.waitForLiveNodes(30, TimeUnit.SECONDS, missingLiveNode(lostNodeName3));
+    
+    operations = getOperations(jetty3, lostNodeName3);
     assertNull(operations);
   }
 
-  private void waitForNodeLeave(String lostNodeName) throws InterruptedException {
-    ZkStateReader reader = cluster.getSolrClient().getZkStateReader();
-    TimeOut timeOut = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME);
-    while (reader.getClusterState().getLiveNodes().contains(lostNodeName)) {
-      Thread.sleep(100);
-      if (timeOut.hasTimedOut()) fail("Wait for " + lostNodeName + " to leave failed!");
-    }
-  }
-
   @SuppressForbidden(reason = "Needs currentTimeMillis to create unique id")
   private List<SolrRequest> getOperations(JettySolrRunner actionJetty, String lostNodeName) throws Exception {
     try (AutoAddReplicasPlanAction action = new AutoAddReplicasPlanAction()) {