You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2020/07/20 22:52:12 UTC

[lucene-solr] branch reference_impl updated: @257 - Fix some close, start on cluster shutdown.

This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/reference_impl by this push:
     new 22ef26e  @257 - Fix some close, start on cluster shutdown.
22ef26e is described below

commit 22ef26ef28f009e14d2d97a6f0188c128aef9230
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Mon Jul 20 17:50:10 2020 -0500

    @257 - Fix some close, start on cluster shutdown.
---
 .../client/solrj/embedded/JettySolrRunner.java     |   5 +-
 .../src/java/org/apache/solr/cloud/Overseer.java   |   4 +-
 .../core/src/java/org/apache/solr/cloud/ZkCLI.java |   2 +-
 .../java/org/apache/solr/cloud/ZkController.java   | 142 ++++++++++++++++++++-
 .../TestCloudPhrasesIdentificationComponent.java   |   4 -
 .../solr/cloud/TestCloudPseudoReturnFields.java    |   4 -
 .../apache/solr/cloud/TestRandomFlRTGCloud.java    |   4 -
 .../TestTolerantUpdateProcessorRandomCloud.java    |   3 -
 .../src/java/org/apache/solr/common/ParWork.java   |   3 +-
 .../org/apache/solr/common/cloud/SolrZkClient.java |   5 +
 .../apache/solr/common/cloud/ZkStateReader.java    |   8 +-
 .../org/apache/solr/common/util/CloseTracker.java  |  30 +++++
 .../solr/common/util/ObjectReleaseTracker.java     |   4 +
 .../apache/solr/cloud/MiniSolrCloudCluster.java    | 113 ++++++++++++----
 .../java/org/apache/solr/cloud/ZkTestServer.java   |  21 ++-
 15 files changed, 295 insertions(+), 57 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java b/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java
index bc1f834..469d0d4 100644
--- a/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java
+++ b/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java
@@ -91,6 +91,7 @@ import org.eclipse.jetty.server.ServerConnector;
 import org.eclipse.jetty.server.SessionIdManager;
 import org.eclipse.jetty.server.SslConnectionFactory;
 import org.eclipse.jetty.server.handler.HandlerWrapper;
+import org.eclipse.jetty.server.handler.ShutdownHandler;
 import org.eclipse.jetty.server.handler.gzip.GzipHandler;
 import org.eclipse.jetty.server.session.DefaultSessionIdManager;
 import org.eclipse.jetty.server.session.HouseKeeper;
@@ -464,7 +465,9 @@ public class JettySolrRunner implements Closeable {
     }
 
     chain = injectJettyHandlers(chain);
-
+    ShutdownHandler shutdownHandler = new ShutdownHandler("solrrocks", true, true);
+    shutdownHandler.setHandler(chain);
+    chain = shutdownHandler;
     if(config.enableV2) {
       RewriteHandler rwh = new RewriteHandler();
       rwh.setHandler(chain);
diff --git a/solr/core/src/java/org/apache/solr/cloud/Overseer.java b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
index a925a27..a124a9d 100644
--- a/solr/core/src/java/org/apache/solr/cloud/Overseer.java
+++ b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
@@ -655,7 +655,9 @@ public class Overseer implements SolrCloseable {
 
     updaterThread.start();
     ccThread.start();
-    triggerThread.start();
+    if (triggerThread != null) {
+      triggerThread.start();
+    }
 
     systemCollectionCompatCheck(new BiConsumer<String, Object>() {
       boolean firstPair = true;
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkCLI.java b/solr/core/src/java/org/apache/solr/cloud/ZkCLI.java
index 0650735..8477129 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkCLI.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkCLI.java
@@ -197,7 +197,7 @@ public class ZkCLI implements CLIO {
       SolrZkClient zkClient = null;
       CoreContainer cc = null;
       try {
-        zkClient = new SolrZkClient(zkServerAddress, 30000, 30000,
+        zkClient = new SolrZkClient(zkServerAddress, 30000, 10000,
             () -> {
             }).start();
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 0b8ba59..63b4448 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -22,8 +22,12 @@ import java.io.IOException;
 import java.io.PrintWriter;
 import java.io.UnsupportedEncodingException;
 import java.lang.invoke.MethodHandles;
+import java.net.HttpURLConnection;
 import java.net.InetAddress;
+import java.net.MalformedURLException;
 import java.net.NetworkInterface;
+import java.net.SocketException;
+import java.net.URL;
 import java.net.URLEncoder;
 import java.net.UnknownHostException;
 import java.nio.charset.StandardCharsets;
@@ -161,6 +165,9 @@ import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
 public class ZkController implements Closeable {
 
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  public static final String CLUSTER_SHUTDOWN = "/cluster/shutdown";
+
   static final int WAIT_DOWN_STATES_TIMEOUT_SECONDS = 60;
   public final int WAIT_FOR_STATE = Integer.getInteger("solr.waitForState", 10);
 
@@ -339,7 +346,6 @@ public class ZkController implements Closeable {
     if (cc == null) log.error("null corecontainer");
     if (cc == null) throw new IllegalArgumentException("CoreContainer cannot be null.");
     try {
-      this.closeZkClient = true;
       this.cc = cc;
       this.descriptorsSupplier = descriptorsSupplier;
       this.cloudConfig = cloudConfig;
@@ -930,9 +936,137 @@ public class ZkController implements Closeable {
   }
 
   private void init() {
-    log.info("do init");
+    log.info("making shutdown watcher for cluster");
+    try {
+      zkClient.exists(CLUSTER_SHUTDOWN, new Watcher() {
+        @Override
+        public void process(WatchedEvent event) {
+          if (Event.EventType.None.equals(event.getType())) {
+            return;
+          }
+
+          log.info("Got even for shutdown {}" + event);
+          if (event.getType().equals(Event.EventType.NodeCreated)) {
+            log.info("Shutdown zk node created, shutting down");
+            shutdown();
+          } else {
+            log.info("Remaking shutdown watcher");
+            Stat stat = null;
+            try {
+              stat = zkClient.exists(CLUSTER_SHUTDOWN, this);
+            } catch (KeeperException e) {
+              SolrException.log(log, e);
+              return;
+            } catch (InterruptedException e) {
+              SolrException.log(log, e);
+              return;
+            }
+            if (stat != null) {
+              log.info("Got shutdown even while remaking watcher, shutting down");
+              shutdown();
+            }
+          }
+        }
+
+        private void shutdown() {
+          Thread updaterThead = overseer.getUpdaterThread();
+          log.info("Cluster shutdown initiated");
+          if (updaterThead!= null && updaterThead.isAlive()) {
+            log.info("We are the Overseer, wait for others to shutdown");
+
+            CountDownLatch latch = new CountDownLatch(1);
+            List<String> children = null;
+            try {
+              children = zkClient.getChildren("/solr" + ZkStateReader.LIVE_NODES_ZKNODE, new Watcher() {
+                @Override
+                public void process(WatchedEvent event) {
+                  if (Event.EventType.None.equals(event.getType())) {
+                    return;
+                  }
+                  if (event.getType() == Event.EventType.NodeChildrenChanged) {
+                    Thread updaterThead = overseer.getUpdaterThread();
+                    if (updaterThead == null || !updaterThead.isAlive()) {
+                      log.info("We were the Overseer, but it seems not anymore, shutting down");
+                      latch.countDown();
+                      return;
+                    }
+
+                    try {
+                      List<String> children = zkClient.getChildren("/solr" + ZkStateReader.LIVE_NODES_ZKNODE, this, false);
+                      if (children.size() == 1) {
+                        latch.countDown();
+                      }
+                    } catch (KeeperException e) {
+                      log.error("Exception on proper shutdown", e);
+                      return;
+                    } catch (InterruptedException e) {
+                      ParWork.propegateInterrupt(e);
+                      return;
+                    }
+                  }
+                }
+              }, false);
+            } catch (KeeperException e) {
+              log.error("Time out waiting to see solr live nodes go down " + children.size());
+              return;
+            } catch (InterruptedException e) {
+              ParWork.propegateInterrupt(e);
+              return;
+            }
+
+            if (children.size() > 1) {
+              boolean success = false;
+              try {
+                success = latch.await(10, TimeUnit.SECONDS);
+              } catch (InterruptedException e) {
+                ParWork.propegateInterrupt(e);
+              }
+              if (!success)  {
+                log.error("Time out waiting to see solr live nodes go down " + children.size());
+              }
+            }
+          }
+
+          ParWork.getExecutor().submit(new Runnable() {
+            @Override
+            public void run() {
+              try {
+                cc.close();
+              } catch (IOException e) {
+                log.error("IOException on shutdown", e);
+                return;
+              }
+              URL url = null;
+              try {
+                url = new URL(getHostName() + ":" + getHostPort() + "/shutdown?token=" + "solrrocks");
+              } catch (MalformedURLException e) {
+                SolrException.log(log, e);
+                return;
+              }
+              try {
+                HttpURLConnection connection = (HttpURLConnection) url.openConnection();
+                connection.setRequestMethod("POST");
+                connection.getResponseCode();
+                log.info("Shutting down " + url + ": " + connection.getResponseCode() + " " + connection.getResponseMessage());
+              } catch (SocketException e) {
+                SolrException.log(log, e);
+                // Okay - the server is not running
+              } catch (IOException e) {
+                SolrException.log(log, e);
+                return;
+              }
+            }
+          });
+        }
+      });
+    } catch (KeeperException e) {
+      throw new SolrException(ErrorCode.SERVER_ERROR, e);
+    } catch (InterruptedException e) {
+      ParWork.propegateInterrupt(e);
+      throw new SolrException(ErrorCode.SERVER_ERROR, e);
+    }
     try {
-      zkClient.mkdir("/cluster_lock");
+      zkClient.mkdirs("/cluster/cluster_lock");
     } catch (KeeperException.NodeExistsException e) {
       // okay
     } catch (KeeperException e) {
@@ -940,7 +1074,7 @@ public class ZkController implements Closeable {
     }
     boolean createdClusterNodes = false;
     try {
-      DistributedLock lock = new DistributedLock(zkClient, "/cluster_lock", zkClient.getZkACLProvider().getACLsToAdd("/cluster_lock"));
+      DistributedLock lock = new DistributedLock(zkClient, "/cluster/cluster_lock", zkClient.getZkACLProvider().getACLsToAdd("/cluster/cluster_lock"));
       if (log.isDebugEnabled()) log.debug("get cluster lock");
       while (!lock.lock()) {
         Thread.sleep(250);
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudPhrasesIdentificationComponent.java b/solr/core/src/test/org/apache/solr/cloud/TestCloudPhrasesIdentificationComponent.java
index 708a39b..777e1e44 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestCloudPhrasesIdentificationComponent.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudPhrasesIdentificationComponent.java
@@ -108,10 +108,6 @@ public class TestCloudPhrasesIdentificationComponent extends SolrCloudTestCase {
 
   @AfterClass
   private static void afterClass() throws Exception {
-    if (null != CLOUD_CLIENT) {
-      CLOUD_CLIENT.close();
-      CLOUD_CLIENT = null;
-    }
     for (HttpSolrClient client : CLIENTS) {
       client.close();
     }
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudPseudoReturnFields.java b/solr/core/src/test/org/apache/solr/cloud/TestCloudPseudoReturnFields.java
index 9ff2db3..3f8c68e 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestCloudPseudoReturnFields.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudPseudoReturnFields.java
@@ -108,10 +108,6 @@ public class TestCloudPseudoReturnFields extends SolrCloudTestCase {
   
   @AfterClass
   private static void afterClass() throws Exception {
-    if (null != CLOUD_CLIENT) {
-      CLOUD_CLIENT.close();
-      CLOUD_CLIENT = null;
-    }
     for (HttpSolrClient client : CLIENTS) {
       client.close();
     }
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestRandomFlRTGCloud.java b/solr/core/src/test/org/apache/solr/cloud/TestRandomFlRTGCloud.java
index 58cf472..a71ed48 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestRandomFlRTGCloud.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestRandomFlRTGCloud.java
@@ -155,10 +155,6 @@ public class TestRandomFlRTGCloud extends SolrCloudTestCase {
 
   @AfterClass
   private static void afterClass() throws Exception {
-    if (null != CLOUD_CLIENT) {
-      CLOUD_CLIENT.close();
-      CLOUD_CLIENT = null;
-    }
     for (HttpSolrClient client : CLIENTS) {
       client.close();
     }
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorRandomCloud.java b/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorRandomCloud.java
index 20c186c..748510b 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorRandomCloud.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorRandomCloud.java
@@ -137,9 +137,6 @@ public class TestTolerantUpdateProcessorRandomCloud extends SolrCloudTestCase {
       }
     }
     NODE_CLIENTS = null;
-    if (CLOUD_CLIENT != null) {
-      CLOUD_CLIENT.close();
-    }
     CLOUD_CLIENT = null;
   }
 
diff --git a/solr/solrj/src/java/org/apache/solr/common/ParWork.java b/solr/solrj/src/java/org/apache/solr/common/ParWork.java
index 543303e..1161c1d 100644
--- a/solr/solrj/src/java/org/apache/solr/common/ParWork.java
+++ b/solr/solrj/src/java/org/apache/solr/common/ParWork.java
@@ -43,6 +43,7 @@ import java.util.concurrent.atomic.AtomicReference;
 
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.solr.client.solrj.impl.HttpClientUtil;
+import org.apache.solr.common.util.ObjectReleaseTracker;
 import org.apache.solr.common.util.OrderedExecutor;
 import org.apache.solr.common.util.SysStats;
 import org.apache.zookeeper.KeeperException;
@@ -667,7 +668,7 @@ public class ParWork implements Closeable {
       } else {
         if (ignoreExceptions) {
           warns.add(t);
-          log.error("Error", t);
+          log.error("Error handling close for an object", new ObjectReleaseTracker.ObjectTrackerException(t));
           if (t instanceof Error && !(t instanceof AssertionError)) {
             throw (Error) t;
           }
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java b/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java
index 922d0f8..058afa8 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java
@@ -61,6 +61,7 @@ import org.apache.solr.common.ParWorkExecutor;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.StringUtils;
 import org.apache.solr.common.cloud.ConnectionManager.IsClosed;
+import org.apache.solr.common.util.CloseTracker;
 import org.apache.solr.common.util.ExecutorUtil;
 import org.apache.solr.common.util.IOUtils;
 import org.apache.solr.common.util.ObjectReleaseTracker;
@@ -97,6 +98,7 @@ public class SolrZkClient implements Closeable {
 
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
   private final int zkClientConnectTimeout;
+  private final CloseTracker closeTracker;
 
   private volatile ConnectionManager connManager;
 
@@ -124,6 +126,7 @@ public class SolrZkClient implements Closeable {
 
   // expert: for tests
   public SolrZkClient() {
+    closeTracker = new CloseTracker();
     zkClientConnectTimeout = 0;
   }
 
@@ -157,6 +160,7 @@ public class SolrZkClient implements Closeable {
   public SolrZkClient(String zkServerAddress, int zkClientTimeout, int clientConnectTimeout,
       ZkClientConnectionStrategy strat, final OnReconnect onReconnect, BeforeReconnect beforeReconnect, ZkACLProvider zkACLProvider, IsClosed higherLevelIsClosed) {
     ObjectReleaseTracker.track(this);
+    closeTracker = new CloseTracker();
     this.zkServerAddress = zkServerAddress;
     this.higherLevelIsClosed = higherLevelIsClosed;
     if (strat == null) {
@@ -854,6 +858,7 @@ public class SolrZkClient implements Closeable {
   }
 
   public void close() {
+    closeTracker.close();
     if (isClosed) return; // it's okay if we over close - same as solrcore
     isClosed = true;
 
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
index 55d1b3a..98b2938 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
@@ -55,6 +55,7 @@ import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.params.AutoScalingParams;
 import org.apache.solr.common.params.CollectionAdminParams;
 import org.apache.solr.common.params.CoreAdminParams;
+import org.apache.solr.common.util.CloseTracker;
 import org.apache.solr.common.util.ExecutorUtil;
 import org.apache.solr.common.util.ObjectReleaseTracker;
 import org.apache.solr.common.util.Pair;
@@ -156,6 +157,7 @@ public class ZkStateReader implements SolrCloseable {
   private static final String SOLR_ENVIRONMENT = "environment";
 
   public static final String REPLICA_TYPE = "type";
+  private final CloseTracker closeTracker;
 
   /**
    * A view of the current state of all collections; combines all the different state sources into a single view.
@@ -337,6 +339,7 @@ public class ZkStateReader implements SolrCloseable {
   }
 
   public ZkStateReader(SolrZkClient zkClient, Runnable securityNodeListener) {
+    closeTracker = new CloseTracker();
     this.zkClient = zkClient;
     this.configManager = new ZkConfigManager(zkClient);
     this.closeClient = false;
@@ -346,6 +349,7 @@ public class ZkStateReader implements SolrCloseable {
 
 
   public ZkStateReader(String zkServerAddress, int zkClientTimeout, int zkClientConnectTimeout) {
+    closeTracker = new CloseTracker();
     this.zkClient = new SolrZkClient(zkServerAddress, zkClientTimeout, zkClientConnectTimeout,
         // on reconnect, reload cloud info
         new OnReconnect() {
@@ -416,7 +420,8 @@ public class ZkStateReader implements SolrCloseable {
       public void process(WatchedEvent event) {
         if (EventType.None.equals(event.getType())) {
           return;
-        }   log.info("Got event on live node watcher {}", event.toString());
+        }
+        log.info("Got event on live node watcher {}", event.toString());
         if (event.getType() == EventType.NodeCreated) {
           latch.countDown();
         } else {
@@ -891,6 +896,7 @@ public class ZkStateReader implements SolrCloseable {
   }
 
   public void close() {
+    closeTracker.close();
     this.closed = true;
 
     try (ParWork closer = new ParWork(this)) {
diff --git a/solr/solrj/src/java/org/apache/solr/common/util/CloseTracker.java b/solr/solrj/src/java/org/apache/solr/common/util/CloseTracker.java
new file mode 100644
index 0000000..f0b0a77
--- /dev/null
+++ b/solr/solrj/src/java/org/apache/solr/common/util/CloseTracker.java
@@ -0,0 +1,30 @@
+package org.apache.solr.common.util;
+
+import org.apache.commons.io.output.StringBuilderWriter;
+import org.apache.solr.common.AlreadyClosedException;
+
+import java.io.Closeable;
+import java.io.PrintWriter;
+
+public class CloseTracker implements Closeable {
+    private volatile boolean closed = false;
+    private volatile String closeStack = "";
+
+    @Override
+    public void close() {
+        if (closed) {
+            throw new AlreadyClosedException(closeStack);
+        }
+
+        StringBuilderWriter sw = new StringBuilderWriter(4096);
+        PrintWriter pw = new PrintWriter(sw);
+        new ObjectReleaseTracker.ObjectTrackerException(this.getClass().getName()).printStackTrace(pw);
+        closeStack = sw.toString();
+        closed = true;
+    }
+
+    public boolean isClosed() {
+        return closed;
+    }
+
+}
diff --git a/solr/solrj/src/java/org/apache/solr/common/util/ObjectReleaseTracker.java b/solr/solrj/src/java/org/apache/solr/common/util/ObjectReleaseTracker.java
index 36024b0..0b4d40b 100644
--- a/solr/solrj/src/java/org/apache/solr/common/util/ObjectReleaseTracker.java
+++ b/solr/solrj/src/java/org/apache/solr/common/util/ObjectReleaseTracker.java
@@ -80,6 +80,10 @@ public class ObjectReleaseTracker {
     public ObjectTrackerException(String msg) {
       super(msg);
     }
+
+    public ObjectTrackerException(Throwable t) {
+      super(t);
+    }
   }
 
 }
diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java
index 91766a0..27e501e 100644
--- a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java
+++ b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java
@@ -77,6 +77,8 @@ import org.apache.solr.core.CoreContainer;
 import org.apache.solr.handler.admin.CollectionsHandler;
 import org.apache.solr.util.TimeOut;
 import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.WatchedEvent;
+import org.apache.zookeeper.Watcher;
 import org.eclipse.jetty.server.handler.HandlerWrapper;
 import org.eclipse.jetty.servlet.ServletHolder;
 import org.slf4j.Logger;
@@ -143,6 +145,7 @@ public class MiniSolrCloudCluster {
       "</solr>\n";
 
   private final Object startupWait = new Object();
+  private final SolrZkClient solrZkClient;
   private volatile ZkTestServer zkServer; // non-final due to injectChaos()
   private final boolean externalZkServer;
   private final List<JettySolrRunner> jettys = new CopyOnWriteArrayList<>();
@@ -332,7 +335,7 @@ public class MiniSolrCloudCluster {
 
       // build the client
       solrClient = buildSolrClient();
-
+      solrZkClient = solrClient.getZkStateReader().getZkClient();
       if (numServers > 0) {
         waitForAllNodes(numServers, STARTUP_WAIT_SECONDS);
       }
@@ -651,20 +654,75 @@ public class MiniSolrCloudCluster {
    */
   public void shutdown() throws Exception {
     try {
+      log.info("creating cluster shutdown zk node");
+      zkServer.getZkClient().mkdirs("/solr" + ZkController.CLUSTER_SHUTDOWN);
+      zkServer.getZkClient().printLayout();
+      zkServer.getZkClient().printLayoutToStream(System.out);
+
+      CountDownLatch latch = new CountDownLatch(1);
+      List<String> children = zkServer.getZkClient().getChildren("/solr" + ZkStateReader.LIVE_NODES_ZKNODE, new Watcher() {
+        @Override
+        public void process(WatchedEvent event) {
+          if (Event.EventType.None.equals(event.getType())) {
+            return;
+          }
+          if (event.getType() == Event.EventType.NodeChildrenChanged) {
+            try {
+              List<String> children = zkServer.getZkClient().getChildren("/solr" + ZkStateReader.LIVE_NODES_ZKNODE, this, false);
+              if (children.size() == 0) {
+                latch.countDown();
+              }
+            } catch (KeeperException e) {
+              log.error("Exception on proper shutdown", e);
+              return;
+            } catch (InterruptedException e) {
+              ParWork.propegateInterrupt(e);
+              return;
+            }
+          }
+        }
+      }, false);
+
+      if (children.size() > 0) {
+        boolean success = latch.await(10, TimeUnit.SECONDS);
+        if (!success)  {
+          throw new TimeoutException("Time out waiting to see solr live nodes go down " + children.size());
+        }
+      }
+
+    } catch (KeeperException.NodeExistsException e) {
+      log.info("Shutdown zk node already exists");
+    } catch (Exception e) {
+      log.error("Exception on proper shutdown", e);
+    }
+
+
+    //ZkStateReader reader = zkServer.getZkClient().getZkStateReader();
+
+//    try {
+//      reader.waitForLiveNodes(10, TimeUnit.SECONDS, (o, n) -> n.size() == 0);
+//    } catch (InterruptedException e) {
+//      Thread.currentThread().interrupt();
+//      throw new SolrException(ErrorCode.SERVER_ERROR, "interrupted");
+//    }
+   // Thread.sleep(40000);
+
+    try {
       List<Callable<JettySolrRunner>> shutdowns = new ArrayList<>(jettys.size());
       for (final JettySolrRunner jetty : jettys) {
-        shutdowns.add(() -> stopJettySolrRunner(jetty, false));
+        shutdowns.add(() -> stopJettySolrRunner(jetty, true));
       }
       jettys.clear();
 
       try (ParWork parWork = new ParWork(this, true)) {
-        parWork.collect(solrClient);
         parWork.collect(shutdowns);
-        parWork.addCollect("jetties&solrClient");
+        parWork.addCollect("jetties");
+        parWork.collect(solrClient);
+        parWork.addCollect("solrClient");
         if (!externalZkServer) {
           parWork.collect(zkServer);
+          parWork.addCollect("zkServer");
         }
-        parWork.addCollect("zkServer");
       }
     } finally {
       System.clearProperty("zkHost");
@@ -682,7 +740,7 @@ public class MiniSolrCloudCluster {
   }
 
   public SolrZkClient getZkClient() {
-    return solrClient.getZkStateReader().getZkClient();
+    return solrZkClient;
   }
   
   protected CloudSolrClient buildSolrClient() {
@@ -759,28 +817,31 @@ public class MiniSolrCloudCluster {
     }
   }
 
-  public synchronized void injectChaos(Random random) throws Exception {
-
-    // sometimes we restart one of the jetty nodes
-    if (random.nextBoolean()) {
-      JettySolrRunner jetty = jettys.get(random.nextInt(jettys.size()));
-      jetty.stop();
-      log.info("============ Restarting jetty");
-      jetty.start();
-    }
+  public  void injectChaos(Random random) throws Exception {
+    if (LuceneTestCase.TEST_NIGHTLY && false) { // nocommit
+      synchronized (this) {
+        // sometimes we restart one of the jetty nodes
+        if (random.nextBoolean()) {
+          JettySolrRunner jetty = jettys.get(random.nextInt(jettys.size()));
+          jetty.stop();
+          log.info("============ Restarting jetty");
+          jetty.start();
+        }
 
-    // sometimes we restart zookeeper
-    if (random.nextBoolean()) {
-      zkServer.shutdown();
-      log.info("============ Restarting zookeeper");
-      zkServer = new ZkTestServer(zkServer.getZkDir(), zkServer.getPort());
-      zkServer.run(false);
-    }
+        // sometimes we restart zookeeper
+        if (random.nextBoolean()) {
+          zkServer.shutdown();
+          log.info("============ Restarting zookeeper");
+          zkServer = new ZkTestServer(zkServer.getZkDir(), zkServer.getPort());
+          zkServer.run(false);
+        }
 
-    // sometimes we cause a connection loss - sometimes it will hit the overseer
-    if (random.nextBoolean()) {
-      JettySolrRunner jetty = jettys.get(random.nextInt(jettys.size()));
-      ChaosMonkey.causeConnectionLoss(jetty);
+        // sometimes we cause a connection loss - sometimes it will hit the overseer
+        if (random.nextBoolean()) {
+          JettySolrRunner jetty = jettys.get(random.nextInt(jettys.size()));
+          ChaosMonkey.causeConnectionLoss(jetty);
+        }
+      }
     }
   }
 
diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/ZkTestServer.java b/solr/test-framework/src/java/org/apache/solr/cloud/ZkTestServer.java
index f633aba..bbdc485 100644
--- a/solr/test-framework/src/java/org/apache/solr/cloud/ZkTestServer.java
+++ b/solr/test-framework/src/java/org/apache/solr/cloud/ZkTestServer.java
@@ -49,6 +49,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
 import javax.management.JMException;
 
 import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.AlreadyClosedException;
 import org.apache.solr.common.ParWork;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.SolrZkClient;
@@ -56,6 +57,7 @@ import org.apache.solr.common.cloud.ZkMaintenanceUtils;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.cloud.ZooKeeperException;
+import org.apache.solr.common.util.CloseTracker;
 import org.apache.solr.common.util.ObjectReleaseTracker;
 import org.apache.solr.common.util.TimeOut;
 import org.apache.solr.common.util.TimeSource;
@@ -97,6 +99,8 @@ public class ZkTestServer implements Closeable {
     }
   }
 
+  private volatile CloseTracker closeTracker;
+
   private Path zkMonitoringFile;
 
   public static final int TIMEOUT = 45000;
@@ -543,18 +547,21 @@ public class ZkTestServer implements Closeable {
     run(false);
   }
 
-  public void run(boolean solrFormat) throws InterruptedException, IOException {
+  public synchronized void run(boolean solrFormat) throws InterruptedException, IOException {
     log.info("STARTING ZK TEST SERVER dataDir={}", this.zkDir);
     // docs say no config for netty yet
    // System.setProperty("zookeeper.serverCnxnFactory", "org.apache.zookeeper.server.NettyServerCnxnFactory");
    // System.setProperty("zookeeper.clientCnxnSocket", "org.apache.zookeeper.ClientCnxnSocketNetty");
 
-
+    if (zooThread != null) {
+      throw new AlreadyClosedException();
+    }
+    if (closeTracker != null) {
+      throw new AlreadyClosedException();
+    }
+    closeTracker = new CloseTracker();
 
     try {
-      if (zooThread != null) {
-        throw new IllegalStateException("ZK TEST SERVER IS ALREADY RUNNING");
-      }
       // we don't call super.distribSetUp
       zooThread = new Thread("ZkTestServer Run Thread") {
 
@@ -620,9 +627,9 @@ public class ZkTestServer implements Closeable {
     }
   }
 
-  public void shutdown() throws IOException, InterruptedException {
+  public synchronized void shutdown() throws IOException, InterruptedException {
     log.info("Shutting down ZkTestServer.");
-
+    closeTracker.close();
     try {
       if (chRootClient != null) {
         chRootClient.printLayout();