Posted to commits@lucene.apache.org by ma...@apache.org on 2021/01/29 20:38:59 UTC

[lucene-solr] 01/12: @1290 Start to prepare for early access production.

This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit b3ed5a879847d2fd0cfb3c2043343b7c078d2178
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Mon Jan 25 06:52:07 2021 -0600

    @1290 Start to prepare for early access production.
---
 .../apache/lucene/store/NRTCachingDirectory.java   |   4 +-
 solr/bin/jetty.sh                                  |   4 +-
 solr/bin/solr                                      |  39 ++-
 .../client/solrj/embedded/JettySolrRunner.java     |  11 +-
 .../java/org/apache/solr/cloud/DistributedMap.java |   8 +-
 .../org/apache/solr/cloud/ElectionContext.java     |   8 +-
 .../java/org/apache/solr/cloud/LeaderElector.java  |  86 +++--
 .../src/java/org/apache/solr/cloud/Overseer.java   | 137 +++++---
 .../apache/solr/cloud/OverseerElectionContext.java |  13 +-
 .../apache/solr/cloud/OverseerTaskProcessor.java   |   8 +-
 .../org/apache/solr/cloud/OverseerTaskQueue.java   |   8 +-
 .../solr/cloud/RecoveringCoreTermWatcher.java      |   2 +-
 .../org/apache/solr/cloud/RecoveryStrategy.java    | 140 ++++----
 .../solr/cloud/ShardLeaderElectionContext.java     |  56 ++--
 .../solr/cloud/ShardLeaderElectionContextBase.java |  13 +-
 .../solr/cloud/SizeLimitedDistributedMap.java      |   7 +-
 .../java/org/apache/solr/cloud/StatePublisher.java |  43 ++-
 .../org/apache/solr/cloud/ZkCollectionTerms.java   |  10 +-
 .../java/org/apache/solr/cloud/ZkController.java   | 313 +++++++-----------
 .../org/apache/solr/cloud/ZkDistributedQueue.java  |   8 +-
 .../java/org/apache/solr/cloud/ZkShardTerms.java   | 106 +++---
 .../apache/solr/cloud/ZkSolrResourceLoader.java    |   2 +-
 .../cloud/api/collections/CreateCollectionCmd.java |  20 +-
 .../cloud/api/collections/DeleteCollectionCmd.java |  13 +-
 .../OverseerCollectionMessageHandler.java          |   3 +-
 .../apache/solr/cloud/overseer/OverseerAction.java |   5 +-
 .../apache/solr/cloud/overseer/ZkStateWriter.java  | 325 ++++++++++---------
 .../apache/solr/core/CachingDirectoryFactory.java  | 174 +++-------
 .../java/org/apache/solr/core/CoreContainer.java   | 340 ++++++++++----------
 .../apache/solr/core/CorePropertiesLocator.java    |   4 +-
 .../src/java/org/apache/solr/core/SolrCore.java    |  27 +-
 .../src/java/org/apache/solr/core/SolrCores.java   |  19 +-
 .../src/java/org/apache/solr/core/ZkContainer.java |   2 +-
 .../java/org/apache/solr/handler/IndexFetcher.java |  22 +-
 .../apache/solr/handler/RequestHandlerBase.java    |   2 +-
 .../solr/handler/admin/CollectionsHandler.java     |   7 +-
 .../solr/handler/admin/CoreAdminOperation.java     |   9 +-
 .../apache/solr/handler/admin/PrepRecoveryOp.java  |  14 +-
 .../org/apache/solr/handler/admin/SplitOp.java     |   1 +
 .../handler/component/RealTimeGetComponent.java    |  23 +-
 .../org/apache/solr/metrics/SolrMetricManager.java |   2 +-
 .../java/org/apache/solr/pkg/PackageListeners.java |   2 +-
 .../java/org/apache/solr/schema/IndexSchema.java   |  10 +-
 .../org/apache/solr/schema/ManagedIndexSchema.java |   4 +-
 .../apache/solr/schema/ZkIndexSchemaReader.java    |   8 +-
 .../java/org/apache/solr/servlet/HttpSolrCall.java |  14 +-
 .../apache/solr/servlet/SolrDispatchFilter.java    |  11 +-
 .../org/apache/solr/servlet/SolrQoSFilter.java     | 101 +++---
 .../java/org/apache/solr/servlet/StopJetty.java    |   2 +-
 .../org/apache/solr/update/AddUpdateCommand.java   |  10 +-
 .../java/org/apache/solr/update/CommitTracker.java |   2 +-
 .../apache/solr/update/DefaultSolrCoreState.java   |  13 +-
 .../java/org/apache/solr/update/HdfsUpdateLog.java |   3 +-
 .../src/java/org/apache/solr/update/PeerSync.java  |  83 ++---
 .../org/apache/solr/update/PeerSyncWithLeader.java |  45 +--
 .../org/apache/solr/update/SolrCmdDistributor.java |   8 +-
 .../java/org/apache/solr/update/SolrCoreState.java |   4 +-
 .../org/apache/solr/update/SolrIndexSplitter.java  |   2 +-
 .../org/apache/solr/update/SolrIndexWriter.java    |  22 +-
 .../src/java/org/apache/solr/update/UpdateLog.java | 111 ++++---
 .../AddSchemaFieldsUpdateProcessorFactory.java     |   4 +-
 .../processor/DistributedUpdateProcessor.java      |  61 ++--
 .../processor/DistributedZkUpdateProcessor.java    |  15 +-
 .../solr/util/plugin/AbstractPluginLoader.java     |   7 +-
 solr/core/src/test-files/log4j2.xml                |   2 +-
 .../src/test/org/apache/solr/CursorPagingTest.java |   6 +-
 .../org/apache/solr/cloud/DeleteReplicaTest.java   |  42 +--
 .../org/apache/solr/cloud/LeaderElectionTest.java  |   7 +-
 .../solr/cloud/MissingSegmentRecoveryTest.java     |   2 +
 .../test/org/apache/solr/cloud/OverseerTest.java   |   2 +-
 .../org/apache/solr/cloud/TestDistributedMap.java  |  14 +-
 .../solr/cloud/TestSizeLimitedDistributedMap.java  |   5 +-
 .../CollectionsAPIDistributedZkTest.java           |   1 +
 .../CreateCollectionsIndexAndRestartTest.java      |   3 +-
 solr/server/etc/jetty-https.xml                    |   2 +-
 solr/server/etc/jetty-https8.xml                   |   5 +
 solr/server/etc/jetty.xml                          |   8 +-
 solr/server/resources/log4j2.xml                   |  12 +-
 .../solr/client/solrj/impl/Http2SolrClient.java    |  20 +-
 .../solr/client/solrj/impl/LBHttp2SolrClient.java  |   2 +-
 .../solr/client/solrj/impl/LBHttpSolrClient.java   |   1 +
 .../solr/client/solrj/impl/LBSolrClient.java       |  16 +-
 .../src/java/org/apache/solr/common/ParWork.java   |  29 +-
 .../org/apache/solr/common/ParWorkExecutor.java    |   4 +-
 .../apache/solr/common/PerThreadExecService.java   |  72 ++---
 .../java/org/apache/solr/common/cloud/Replica.java |   4 +
 .../apache/solr/common/cloud/SolrZooKeeper.java    |   6 +-
 .../apache/solr/common/cloud/ZkStateReader.java    | 354 ++++++++++-----------
 .../solr/common/util/SolrQueuedThreadPool.java     |   6 +-
 .../java/org/apache/solr/common/util/SysStats.java |  23 +-
 .../org/apache/solr/logging/MDCLoggingContext.java | 105 +-----
 .../org/apache/zookeeper/ZooKeeperExposed.java     |   2 +-
 .../src/java/org/apache/solr/SolrTestCase.java     |   1 +
 .../src/resources/logconf/log4j2-startup-debug.xml |   8 +-
 .../src/resources/logconf/log4j2-std-debug.xml     |   9 +-
 95 files changed, 1680 insertions(+), 1673 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java b/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java
index ec7e5b9..fee7f51 100644
--- a/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java
+++ b/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java
@@ -212,7 +212,7 @@ public class NRTCachingDirectory extends FilterDirectory implements Accountable
     // it for defensive reasons... or in case the app is
     // doing something custom (creating outputs directly w/o
     // using IndexWriter):
-    if (Boolean.getBoolean("solr.nrtDirSync")) { // nocommit)
+  //  if (Boolean.getBoolean("solr.nrtDirSync")) { // nocommit)
       IOUtils.close(() -> {
         if (!closed.getAndSet(true)) {
           for (String fileName : cacheDirectory.listAll()) {
@@ -220,7 +220,7 @@ public class NRTCachingDirectory extends FilterDirectory implements Accountable
           }
         }
       }, cacheDirectory, in);
-    }
+  //  }
   }
 
   /** Subclass can override this to customize logic; return
diff --git a/solr/bin/jetty.sh b/solr/bin/jetty.sh
index e30a504..f9b3f53 100644
--- a/solr/bin/jetty.sh
+++ b/solr/bin/jetty.sh
@@ -137,8 +137,8 @@ started()
     [ -z "$(grep STARTED $1 2>/dev/null)" ] || return 0
     [ -z "$(grep STOPPED $1 2>/dev/null)" ] || return 1
     [ -z "$(grep FAILED $1 2>/dev/null)" ] || return 1
-    #local PID=$(cat "$2" 2>/dev/null) || return 1
-    #kill -0 "$PID" 2>/dev/null || return 1
+    local PID=$(cat "$2" 2>/dev/null) || return 1
+    kill -0 "$PID" 2>/dev/null || return 1
     echo -n ". "
     sleep .3
   done
diff --git a/solr/bin/solr b/solr/bin/solr
index 3216d78..0288095 100755
--- a/solr/bin/solr
+++ b/solr/bin/solr
@@ -829,6 +829,18 @@ function run_package() {
   #exit $?
 }
 
+function running()
+{
+  if [ -f "$1" ]
+  then
+    local PID=$(cat "$1" 2>/dev/null) || return 1
+    kill -0 "$PID" 2>/dev/null
+    return
+  fi
+  rm -f "$1"
+  return 1
+}
+
 # tries to gracefully stop Solr using the Jetty
 # stop command and if that fails, then uses kill -9
 # (will attempt to jstack before killing)
@@ -845,8 +857,31 @@ function stop_solr() {
 
   echo -e "Sending stop command to Solr running on port $SOLR_PORT ...  "$JAVA" -cp $SOLR_TIP/server/lib/ext/solr-core*.jar $SOLR_SSL_OPTS $AUTHC_OPTS org.apache.solr.servlet.StopJetty "-DSTOP.PORT=$THIS_STOP_PORT" "-DSTOP.KEY=$STOP_KEY""
   "$JAVA" -cp $SOLR_TIP/server/lib/ext/solr-core*.jar $SOLR_SSL_OPTS $AUTHC_OPTS "-DSTOP.PORT=$THIS_STOP_PORT" "-DSTOP.KEY=$STOP_KEY" org.apache.solr.servlet.StopJetty || true
+
+  if [ ! -f "$SOLR_PID_DIR/$JETTY_PID" ] ; then
+    echo "ERROR: no pid found at $SOLR_PID_DIR/$JETTY_PID"
+    exit 1
+  fi
   PID=$(cat $SOLR_PID_DIR/$JETTY_PID)
-  rm $SOLR_PID_DIR/$JETTY_PID
+
+  if [ -z "$PID" ] ; then
+    echo "ERROR: no pid id found in $PID"
+    exit 1
+  fi
+
+  TIMEOUT=30
+  while running "$SOLR_PID_DIR/$JETTY_PID"; do
+    if (( TIMEOUT-- == 0 )); then
+       kill -KILL "$PID" 2>/dev/null
+    fi
+
+    usleep 300000
+  done
+
+  rm -f "$SOLR_PID_DIR/$JETTY_PID"
+  rm -f "${SOLR_HOME}/jetty.state"
+  usleep 300000
+  echo OK
 } # end stop_solr
 
 if [ $# -eq 1 ]; then
@@ -1904,7 +1939,7 @@ if [ -z ${GC_LOG_OPTS+x} ]; then
                  '-XX:+PrintGCDateStamps' '-XX:+PrintGCTimeStamps' '-XX:+PrintTenuringDistribution' \
                  '-XX:+PrintGCApplicationStoppedTime')
   else
-    GC_LOG_OPTS=('-Xlog:gc*')
+    GC_LOG_OPTS=()
   fi
 else
   GC_LOG_OPTS=($GC_LOG_OPTS)
diff --git a/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java b/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java
index 96c7fb0..34faa29 100644
--- a/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java
+++ b/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java
@@ -411,19 +411,20 @@ public class JettySolrRunner implements Closeable {
             for (Map.Entry<ServletHolder,String> entry : config.extraServlets.entrySet()) {
               root.addServlet(entry.getKey(), entry.getValue());
             }
+
+           // qosFilter = root.getServletHandler().newFilterHolder(Source.EMBEDDED);
+           // qosFilter.setHeldClass(SolrQoSFilter.class);
+           // qosFilter.setAsyncSupported(true);
+
             dispatchFilter = root.getServletHandler().newFilterHolder(Source.EMBEDDED);
             dispatchFilter.setHeldClass(SolrDispatchFilter.class);
             dispatchFilter.setInitParameter("excludePatterns", excludePatterns);
 
-            qosFilter = root.getServletHandler().newFilterHolder(Source.EMBEDDED);
-            qosFilter.setHeldClass(SolrQoSFilter.class);
-            qosFilter.setAsyncSupported(true);
-            root.addFilter(qosFilter, "*", EnumSet.of(DispatcherType.REQUEST, DispatcherType.ASYNC));
-
             root.addServlet(Servlet404.class, "/*");
 
             // Map dispatchFilter in same path as in web.xml
             dispatchFilter.setAsyncSupported(true);
+           // root.addFilter(qosFilter, "*", EnumSet.of(DispatcherType.REQUEST, DispatcherType.ASYNC));
             root.addFilter(dispatchFilter, "*", EnumSet.of(DispatcherType.REQUEST, DispatcherType.ASYNC));
 
             if (log.isDebugEnabled()) log.debug("Jetty loaded and ready to go");
diff --git a/solr/core/src/java/org/apache/solr/cloud/DistributedMap.java b/solr/core/src/java/org/apache/solr/cloud/DistributedMap.java
index 5224290..beb3568 100644
--- a/solr/core/src/java/org/apache/solr/cloud/DistributedMap.java
+++ b/solr/core/src/java/org/apache/solr/cloud/DistributedMap.java
@@ -51,17 +51,17 @@ public class DistributedMap {
 
   public void update(String trackingId, byte[] data) throws KeeperException, InterruptedException {
     String path = dir + "/" + PREFIX + trackingId;
-    log.info("set data in distmap {}", path);
+    if (log.isDebugEnabled()) log.debug("set data in distmap {}", path);
     if (data == null || data.length == 0) {
       throw new IllegalArgumentException();
     }
     zookeeper.setData(path, data, true);
   }
 
-  public void put(String trackingId, byte[] data) throws KeeperException, InterruptedException {
+  public void put(String trackingId, byte[] data, CreateMode createMode) throws KeeperException, InterruptedException {
     String path = dir + "/" + PREFIX + trackingId;
-    log.info("put in distmap {}", path);
-    zookeeper.makePath(path, data, CreateMode.PERSISTENT, null, false, true);
+    if (log.isDebugEnabled()) log.debug("put in distmap {}", path);
+    zookeeper.makePath(path, data, createMode, null, false, true);
   }
   
   /**
diff --git a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
index e0d775e..8774bb6 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
@@ -16,12 +16,11 @@
  */
 package org.apache.solr.cloud;
 
-import java.io.Closeable;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 
 import org.apache.solr.common.cloud.Replica;
-import org.apache.solr.common.util.ObjectReleaseTracker;
+import org.apache.solr.core.CoreDescriptor;
 import org.apache.zookeeper.KeeperException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -30,18 +29,19 @@ public abstract class ElectionContext {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
   protected final String electionPath;
   protected final Replica leaderProps;
+  protected final CoreDescriptor cd;
   protected final String id;
   protected final String leaderPath;
   protected volatile String leaderSeqPath;
   protected volatile String watchedSeqPath;
 
 
-  public ElectionContext(final String id, final String electionPath, final String leaderPath, final Replica leaderProps) {
+  public ElectionContext(final String id, final String electionPath, final String leaderPath, final Replica leaderProps, CoreDescriptor cd) {
     this.id = id;
     this.electionPath = electionPath;
     this.leaderPath = leaderPath;
     this.leaderProps = leaderProps;
-
+    this.cd = cd;
   }
 
   protected void cancelElection() throws InterruptedException, KeeperException {
diff --git a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
index c9b7ffe..5d84340 100644
--- a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
+++ b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
@@ -128,6 +128,10 @@ public class LeaderElector implements Closeable {
         return false;
       }
 
+      if (state == LEADER || state == POT_LEADER) {
+        return false;
+      }
+
       executor.submit(() -> {
         context.checkIfIamLeaderFired();
       });
@@ -155,7 +159,7 @@ public class LeaderElector implements Closeable {
         // we couldn't set our watch for some other reason, retry
         log.error("Failed on election getchildren call {} {}", e.getClass().getName(), e.getMessage());
         state = OUT_OF_ELECTION;
-        return true;
+        return false;
       }
 
       try {
@@ -184,11 +188,11 @@ public class LeaderElector implements Closeable {
             oldWatcher.close();
           }
 
-          if ((zkController != null && zkController.getCoreContainer().isShutDown())) {
-            if (log.isDebugEnabled()) log.debug("Elector is closed, will not try and run leader processes");
-            state = OUT_OF_ELECTION;
-            return false;
-          }
+//          if ((zkController != null && zkController.getCoreContainer().isShutDown())) {
+//            if (log.isDebugEnabled()) log.debug("Elector is closed, will not try and run leader processes");
+//            state = OUT_OF_ELECTION;
+//            return false;
+//          }
 
           state = POT_LEADER;
           runIamLeaderProcess(context, replacement);
@@ -237,8 +241,10 @@ public class LeaderElector implements Closeable {
             log.warn("Failed setting election watch, retrying {} {}", e.getClass().getName(), e.getMessage());
             state = OUT_OF_ELECTION;
             return true;
-          } catch (Exception e) {
+          } catch (AlreadyClosedException e) {
             state = OUT_OF_ELECTION;
+            return false;
+          } catch (Exception e) {
             // we couldn't set our watch for some other reason, retry
             log.error("Failed setting election watch {} {}", e.getClass().getName(), e.getMessage());
             state = OUT_OF_ELECTION;
@@ -252,10 +258,11 @@ public class LeaderElector implements Closeable {
         return true;
       } catch (AlreadyClosedException e) {
         state = OUT_OF_ELECTION;
-        return true;
+        return false;
       } catch (Exception e) {
+        log.error("Exception", e);
         state = OUT_OF_ELECTION;
-        return true;
+        return false;
       }
 
     } finally {
@@ -267,19 +274,24 @@ public class LeaderElector implements Closeable {
   // TODO: get this core param out of here
   protected void runIamLeaderProcess(final ElectionContext context, boolean weAreReplacement) throws KeeperException,
           InterruptedException, IOException {
-    if (state == CLOSED) {
-      throw new AlreadyClosedException();
-    }
-    if (state == LEADER) {
-      throw new IllegalStateException("Already in leader state");
-    }
-
-    boolean success = context.runLeaderProcess(context, weAreReplacement, 0);
-
-    if (success) {
-      state = LEADER;
-    } else {
-      state = OUT_OF_ELECTION;
+//    if (state == CLOSED) {
+//      throw new AlreadyClosedException();
+//    }
+//    if (state == LEADER) {
+//      throw new IllegalStateException("Already in leader state");
+//    }
+    boolean success = false;
+    try {
+      success = context.runLeaderProcess(context, weAreReplacement, 0);
+      if (success) {
+        state = LEADER;
+      } else {
+        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Failed becoming leader");
+      }
+    } finally {
+      if (!success) {
+        state = OUT_OF_ELECTION;
+      }
     }
   }
 
@@ -325,11 +337,15 @@ public class LeaderElector implements Closeable {
   public void joinElection(boolean replacement,boolean joinAtHead) {
     if (!isClosed && !zkController.getCoreContainer().isShutDown() && !zkController.isDcCalled()) {
       joinFuture = executor.submit(() -> {
+        MDCLoggingContext.setCoreName(context.leaderProps.getName());
+        MDCLoggingContext.setNode(zkController.getNodeName());
         try {
           isCancelled = false;
           doJoinElection(context, replacement, joinAtHead);
         } catch (Exception e) {
           log.error("Exception trying to join election", e);
+        } finally {
+          MDCLoggingContext.clear();
         }
       });
     }
@@ -346,12 +362,13 @@ public class LeaderElector implements Closeable {
   public synchronized void doJoinElection(ElectionContext context, boolean replacement,boolean joinAtHead) throws KeeperException, InterruptedException, IOException {
     //if (checkClosed(context)) return false;
     if (shouldRejectJoins() || state == CLOSED) {
-      log.info("elector is closed, won't join election");
+      log.info("Won't join election {}", state);
       throw new AlreadyClosedException();
     }
 
-    if (state != OUT_OF_ELECTION) {
-      throw new IllegalStateException("Expected " + OUT_OF_ELECTION + " but got " + state);
+    if (state == LEADER) {
+      log.error("Wrong state",new IllegalStateException("Got " + state));
+      throw new IllegalStateException("Wrong state",new IllegalStateException("Got " + state));
     }
     state = JOIN;
 
@@ -421,12 +438,13 @@ public class LeaderElector implements Closeable {
         // we must have failed in creating the election node - someone else must
         // be working on it, lets try again
         log.info("No node found during election {} " + e.getMessage(), e.getPath());
-        if (tries++ > 5) {
-          log.error("No node found during election {} " + e.getMessage(), e.getPath());
-          throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR,
-              "", e);
-        }
-        cont = true;
+//        if (tries++ > 5) {
+//          log.error("No node found during election {} " + e.getMessage(), e.getPath());
+//          throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR,
+//              "", e);
+//        }
+//        cont = true;
+        throw new AlreadyClosedException();
       }
     }
 
@@ -512,6 +530,10 @@ public class LeaderElector implements Closeable {
     return isClosed;
   }
 
+  public String getState() {
+    return state;
+  }
+
   private class ElectionWatcher implements Watcher, Closeable {
     final String myNode, watchedNode;
     final ElectionContext context;
@@ -570,6 +592,8 @@ public class LeaderElector implements Closeable {
       SolrZooKeeper zk = zkClient.getSolrZooKeeper();
       try {
         zk.removeWatches(watchedNode, this, WatcherType.Any, true);
+      } catch (KeeperException.NoWatcherException e) {
+        // expected: no watch was set or it already fired
       } catch (Exception e) {
         log.info("could not remove watch {} {}", e.getClass().getSimpleName(), e.getMessage());
       }
diff --git a/solr/core/src/java/org/apache/solr/cloud/Overseer.java b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
index 6dac590..6a05079 100644
--- a/solr/core/src/java/org/apache/solr/cloud/Overseer.java
+++ b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
@@ -30,6 +30,7 @@ import org.apache.solr.cloud.overseer.OverseerAction;
 import org.apache.solr.cloud.overseer.ZkStateWriter;
 import org.apache.solr.common.AlreadyClosedException;
 import org.apache.solr.common.ParWork;
+import org.apache.solr.common.ParWorkExecutor;
 import org.apache.solr.common.SolrCloseable;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrThread;
@@ -45,15 +46,16 @@ import org.apache.solr.common.util.IOUtils;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.ObjectReleaseTracker;
 import org.apache.solr.common.util.Pair;
+import org.apache.solr.common.util.SysStats;
 import org.apache.solr.core.CloudConfig;
 import org.apache.solr.core.CoreContainer;
 import org.apache.solr.handler.admin.CollectionsHandler;
 import org.apache.solr.logging.MDCLoggingContext;
 import org.apache.solr.update.UpdateShardHandler;
+import org.apache.zookeeper.CreateMode;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
-import org.apache.zookeeper.data.Stat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -71,7 +73,8 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ExecutorService;
-import java.util.concurrent.SynchronousQueue;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.locks.ReentrantLock;
 import java.util.function.BiConsumer;
 
@@ -158,7 +161,9 @@ public class Overseer implements SolrCloseable {
   private volatile boolean initedHttpClient = false;
   private volatile QueueWatcher queueWatcher;
   private volatile WorkQueueWatcher.CollectionWorkQueueWatcher collectionQueueWatcher;
-  private volatile ExecutorService taskExecutor;
+  private volatile ParWorkExecutor taskExecutor;
+
+  private volatile ParWorkExecutor zkWriterExecutor;
 
   public boolean isDone() {
     return closeAndDone;
@@ -168,6 +173,10 @@ public class Overseer implements SolrCloseable {
     return taskExecutor;
   }
 
+  public ExecutorService getTaskZkWriterExecutor() {
+    return zkWriterExecutor;
+  }
+
   private static class StringBiConsumer implements BiConsumer<String, Object> {
     boolean firstPair = true;
 
@@ -277,22 +286,22 @@ public class Overseer implements SolrCloseable {
 
   //  doClose();
 
-    MDCLoggingContext.setNode(zkController == null ?
-        null :
-        zkController.getNodeName());
-
     this.id = id;
 //
 //     stateManagmentExecutor = ParWork.getParExecutorService("stateManagmentExecutor",
 //        1, 1, 3000, new SynchronousQueue());
-     taskExecutor = ParWork.getParExecutorService("overseerTaskExecutor",
-        3, 32, 1000, new SynchronousQueue());
+     taskExecutor = (ParWorkExecutor) ParWork.getParExecutorService("overseerTaskExecutor",
+         4, SysStats.PROC_COUNT, 1000, new LinkedBlockingQueue<>(1024));
+    for (int i = 0; i < 4; i++) {
+      taskExecutor.submit(() -> {});
+    }
+
+    zkWriterExecutor = (ParWorkExecutor) ParWork.getParExecutorService("overseerZkWriterExecutor",
+        4, SysStats.PROC_COUNT, 1000, new LinkedBlockingQueue<>(1024));
+    for (int i = 0; i < 4; i++) {
+      zkWriterExecutor.submit(() -> {});
+    }
 
-//    try {
-//      if (context != null) context.close();
-//    } catch (Exception e) {
-//      log.error("", e);
-//    }
     if (overseerOnlyClient == null && !closeAndDone && !initedHttpClient) {
       overseerOnlyClient = new Http2SolrClient.Builder().idleTimeout(60000).connectionTimeout(5000).markInternalRequest().build();
       overseerOnlyClient.enableCloseLock();
@@ -336,7 +345,7 @@ public class Overseer implements SolrCloseable {
     ThreadGroup ccTg = new ThreadGroup("Overseer collection creation process.");
 
 
-    this.zkStateWriter = new ZkStateWriter(zkController.getZkStateReader(), stats);
+    this.zkStateWriter = new ZkStateWriter(zkController.getZkStateReader(), stats, this);
     //systemCollectionCompatCheck(new StringBiConsumer());
 
     queueWatcher = new WorkQueueWatcher(getCoreContainer());
@@ -489,22 +498,9 @@ public class Overseer implements SolrCloseable {
 
     boolean cd = closeAndDone;
 
-    if (cd) {
-      if (taskExecutor != null) {
-        taskExecutor.shutdown();
-      }
-    }
-
     OUR_JVM_OVERSEER = null;
     closed = true;
 
-    if (queueWatcher != null) {
-      queueWatcher.close();
-    }
-
-    if (collectionQueueWatcher != null) {
-      collectionQueueWatcher.close();
-    }
 
     if (!cd) {
       boolean retry;
@@ -519,8 +515,23 @@ public class Overseer implements SolrCloseable {
     }
 
     if (cd) {
-      if (taskExecutor != null && !taskExecutor.isShutdown()) {
+
+      if (taskExecutor != null) {
         taskExecutor.shutdown();
+        try {
+          taskExecutor.awaitTermination(10, TimeUnit.SECONDS);
+        } catch (InterruptedException e) {
+          Thread.currentThread().interrupt();
+        }
+      }
+
+      if (zkWriterExecutor != null) {
+        zkWriterExecutor.shutdown();
+        try {
+          zkWriterExecutor.awaitTermination(10, TimeUnit.SECONDS);
+        } catch (InterruptedException e) {
+          Thread.currentThread().interrupt();
+        }
       }
 
       if (overseerOnlyClient != null) {
@@ -537,8 +548,12 @@ public class Overseer implements SolrCloseable {
         overseerOnlyClient = null;
       }
 
-      if (taskExecutor != null) {
-        taskExecutor.shutdownNow();
+      if (queueWatcher != null) {
+        queueWatcher.close();
+      }
+
+      if (collectionQueueWatcher != null) {
+        collectionQueueWatcher.close();
       }
 
     }
@@ -754,8 +769,10 @@ public class Overseer implements SolrCloseable {
         if (log.isDebugEnabled()) log.debug("set watch on Overseer work queue {}", path);
 
         List<String> children = zkController.getZkClient().getChildren(path, this, true);
-        Collections.sort(children);
-        return children;
+
+        List<String> items = new ArrayList<>(children);
+        Collections.sort(items);
+        return items;
       } catch (KeeperException.SessionExpiredException e) {
         log.warn("ZooKeeper session expired");
         overseer.close();
@@ -795,6 +812,8 @@ public class Overseer implements SolrCloseable {
           if (items.size() > 0) {
             processQueueItems(items, false);
           }
+        } catch (AlreadyClosedException e) {
+          // expected during shutdown; safe to ignore
         } catch (Exception e) {
           log.error("Exception during overseer queue queue processing", e);
         }
@@ -813,6 +832,8 @@ public class Overseer implements SolrCloseable {
         this.closed = true;
         try {
           zkController.getZkClient().getSolrZooKeeper().removeWatches(path, this, WatcherType.Data, true);
+        } catch (KeeperException.NoWatcherException e) {
+          // expected: the watch was already removed
         } catch (Exception e) {
           log.info("could not remove watch {} {}", e.getClass().getSimpleName(), e.getMessage());
         }
@@ -847,14 +868,18 @@ public class Overseer implements SolrCloseable {
           final ZkNodeProps message = ZkNodeProps.load(item);
           try {
             boolean success = overseer.processQueueItem(message);
-          } catch (InterruptedException e) {
-            log.error("Overseer state update queue processing interrupted");
-            return;
+          } catch (Exception e) {
+            log.error("Overseer state update queue processing failed", e);
           }
         }
 
         overseer.writePendingUpdates();
-        zkController.getZkClient().delete(fullPaths, true);
+
+        try {
+          zkController.getZkClient().delete(fullPaths, true);
+        } catch (Exception e) {
+          log.error("Failed deleting processed items", e);
+        }
 
 
       } finally {
@@ -905,7 +930,24 @@ public class Overseer implements SolrCloseable {
 
           Map<String,byte[]> data = zkController.getZkClient().getData(fullPaths);
 
-          overseer.getTaskExecutor().submit(() -> {
+          try {
+            zkController.getZkClient().delete(fullPaths, true);
+          } catch (Exception e) {
+            log.warn("Delete items failed {}", e.getMessage());
+          }
+
+          try {
+            log.info("items in queue {} after delete {} {}", path, zkController.getZkClient().listZnode(path, false));
+          } catch (KeeperException e) {
+            log.warn("Check items failed {}", e.getMessage());
+          } catch (InterruptedException e) {
+            log.warn("Check items failed {}", e.getMessage());
+          } catch (SolrServerException e) {
+            log.warn("Check items failed {}", e.getMessage());
+          }
+
+          overseer.getTaskZkWriterExecutor().submit(() -> {
+            MDCLoggingContext.setNode(zkController.getNodeName());
             try {
               runAsync(items, fullPaths, data, onStart);
             } catch (Exception e) {
@@ -925,20 +967,19 @@ public class Overseer implements SolrCloseable {
           throw new AlreadyClosedException();
         }
 
-        try (ParWork work = new ParWork(this, false, true)) {
+        try (ParWork work = new ParWork(this, false, false)) {
           for (Map.Entry<String,byte[]> entry : data.entrySet()) {
             work.collect("", ()->{
               try {
                 byte[] item = entry.getValue();
                 if (item == null) {
                   log.error("empty item {}", entry.getKey());
-                  zkController.getZkClient().delete(entry.getKey(), -1);
                   return;
                 }
+
                 String responsePath = Overseer.OVERSEER_COLLECTION_MAP_COMPLETED + "/" + OverseerTaskQueue.RESPONSE_PREFIX + entry.getKey().substring(entry.getKey().lastIndexOf("-") + 1);
 
                 final ZkNodeProps message = ZkNodeProps.load(item);
-                zkController.getZkClient().delete(entry.getKey(), -1);
                 try {
                   String operation = message.getStr(Overseer.QUEUE_OPERATION);
 
@@ -979,13 +1020,6 @@ public class Overseer implements SolrCloseable {
                     response = collMessageHandler.processMessage(message, operation, zkWriter);
                   }
 
-                  //          try {
-                  //            overseer.writePendingUpdates();
-                  //          } catch (InterruptedException e) {
-                  //            log.error("Overseer state update queue processing interrupted");
-                  //            return;
-                  //          }
-
                   if (log.isDebugEnabled()) log.debug("response {}", response);
 
                   if (response == null) {
@@ -1001,7 +1035,7 @@ public class Overseer implements SolrCloseable {
                     if (log.isDebugEnabled()) {
                       log.debug("Updated completed map for task with zkid:[{}]", asyncId);
                     }
-                    completedMap.put(asyncId, OverseerSolrResponseSerializer.serialize(response));
+                    completedMap.put(asyncId, OverseerSolrResponseSerializer.serialize(response), CreateMode.PERSISTENT);
 
                   } else {
                     byte[] sdata = OverseerSolrResponseSerializer.serialize(response);
@@ -1009,9 +1043,8 @@ public class Overseer implements SolrCloseable {
                     log.info("Completed task:[{}] {} {}", message, response.getResponse(), responsePath);
                   }
 
-                } catch (InterruptedException e) {
-                  log.error("Overseer state update queue processing interrupted");
-                  return;
+                } catch (Exception e) {
+                  log.error("Exception processing entry");
                 }
 
               } catch (Exception e) {
diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java
index 43feae3..04a4d43 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java
@@ -17,6 +17,7 @@
 
 package org.apache.solr.cloud;
 
+import org.apache.solr.common.AlreadyClosedException;
 import org.apache.solr.common.ParWork;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.SolrZkClient;
@@ -37,7 +38,7 @@ final class OverseerElectionContext extends ShardLeaderElectionContextBase {
   private final Overseer overseer;
 
   public OverseerElectionContext(final String zkNodeName, SolrZkClient zkClient, Overseer overseer) {
-    super(zkNodeName, Overseer.OVERSEER_ELECT, Overseer.OVERSEER_ELECT + "/leader", new Replica("overseer:" + overseer.getZkController().getNodeName(), getIDMap(zkNodeName, overseer), null, null, overseer.getZkStateReader()), zkClient);
+    super(zkNodeName, Overseer.OVERSEER_ELECT, Overseer.OVERSEER_ELECT + "/leader", new Replica("overseer:" + overseer.getZkController().getNodeName(), getIDMap(zkNodeName, overseer), null, null, overseer.getZkStateReader()), null, zkClient);
     this.overseer = overseer;
     this.zkClient = zkClient;
   }
@@ -56,7 +57,7 @@ final class OverseerElectionContext extends ShardLeaderElectionContextBase {
 
     if (overseer.isDone()) {
       log.info("Already closed, bailing ...");
-      return false;
+      throw new AlreadyClosedException();
     }
 
     // TODO: the idea here is that we could clear the Overseer queue
@@ -76,12 +77,14 @@ final class OverseerElectionContext extends ShardLeaderElectionContextBase {
 //      clearQueue(Overseer.getInternalWorkQueue(zkClient, new Stats()));
 //    }
 
-
-    super.runLeaderProcess(context, weAreReplacement, pauseBeforeStartMs);
+    boolean success = super.runLeaderProcess(context, weAreReplacement, pauseBeforeStartMs);
+    if (!success) {
+      return false;
+    }
 
     if (!overseer.getZkController().getCoreContainer().isShutDown() && !overseer.getZkController().isShudownCalled()
         && !overseer.isDone()) {
-      log.info("Starting overseer after winnning Overseer election {}", id);
+      log.info("Starting overseer after winning Overseer election {}", id);
       overseer.start(id, context);
     } else {
       log.info("Will not start Overseer because we are closed");
diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
index 9adc30d..06c4082 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
@@ -28,6 +28,7 @@ import org.apache.solr.common.util.IOUtils;
 import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.common.util.Utils;
 import org.apache.solr.core.CoreContainer;
+import org.apache.zookeeper.CreateMode;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.data.Stat;
 import org.slf4j.Logger;
@@ -44,7 +45,6 @@ import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentSkipListMap;
 import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.function.Predicate;
 
@@ -258,7 +258,7 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
 
     if (asyncId != null) {
       log.info("Add async task {} to running map", asyncId);
-      runningMap.put(asyncId, null);
+      runningMap.put(asyncId, null, CreateMode.PERSISTENT);
     }
   }
 
@@ -307,12 +307,12 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
             if (log.isDebugEnabled()) {
               log.debug("Updated failed map for task with id:[{}]", asyncId);
             }
-            failureMap.put(asyncId, OverseerSolrResponseSerializer.serialize(response));
+            failureMap.put(asyncId, OverseerSolrResponseSerializer.serialize(response), CreateMode.PERSISTENT);
           } else {
             if (log.isDebugEnabled()) {
               log.debug("Updated completed map for task with zkid:[{}]", asyncId);
             }
-            completedMap.put(asyncId, OverseerSolrResponseSerializer.serialize(response));
+            completedMap.put(asyncId, OverseerSolrResponseSerializer.serialize(response), CreateMode.PERSISTENT);
 
           }
         } else {
diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskQueue.java b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskQueue.java
index 778336a..8ddc86c 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskQueue.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskQueue.java
@@ -226,6 +226,8 @@ public class OverseerTaskQueue extends ZkDistributedQueue {
       this.closed = true;
       try {
         zkClient.getSolrZooKeeper().removeWatches(path, this, WatcherType.Data, true);
+      } catch (KeeperException.NoWatcherException e) {
+        // expected: the watch was already removed
       } catch (Exception e) {
         log.info("could not remove watch {} {}", e.getClass().getSimpleName(), e.getMessage());
       }
@@ -291,7 +293,7 @@ public class OverseerTaskQueue extends ZkDistributedQueue {
       if (log.isDebugEnabled()) log.debug("get data from response node {} {} {}", watchID, bytes == null ? null : bytes.length, watcher.getWatchedEvent());
 
       if (bytes == null || bytes.length == 0) {
-        log.error("Found no data at response node {}", watchID);
+        log.error("Found no data at response node, Overseer likely changed {}", watchID);
       }
       // create the event before deleting the node, otherwise we can get the deleted
       // event from the watcher.
@@ -307,13 +309,13 @@ public class OverseerTaskQueue extends ZkDistributedQueue {
 
   String createRequestNode(byte[] data, String watchID) throws KeeperException, InterruptedException {
     return createData(dir + "/" + PREFIX + watchID.substring(watchID.lastIndexOf("-") + 1),
-        data, CreateMode.PERSISTENT);
+        data, CreateMode.EPHEMERAL);
   }
 
   String createResponseNode() throws KeeperException, InterruptedException {
     return createData(
         Overseer.OVERSEER_COLLECTION_MAP_COMPLETED + "/" + RESPONSE_PREFIX,
-        null, CreateMode.PERSISTENT_SEQUENTIAL);
+        null, CreateMode.EPHEMERAL_SEQUENTIAL);
   }
 
   private static void printQueueEventsListElementIds(ArrayList<QueueEvent> topN) {
diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveringCoreTermWatcher.java b/solr/core/src/java/org/apache/solr/cloud/RecoveringCoreTermWatcher.java
index 3a4d371..74dc19e 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveringCoreTermWatcher.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveringCoreTermWatcher.java
@@ -51,7 +51,7 @@ public class RecoveringCoreTermWatcher implements ZkShardTerms.CoreTermWatcher,
   @Override
   public boolean onTermChanged(ShardTerms terms) {
     if (coreContainer.isShutDown()) return false;
-    MDCLoggingContext.setCoreDescriptor(coreContainer, coreDescriptor);
+    MDCLoggingContext.setCoreName(coreDescriptor.getName());
 
     try {
       if (closed) {
diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index eb377d6..32c73ec 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -56,11 +56,9 @@ import org.apache.solr.util.plugin.NamedListInitializedPlugin;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import static org.apache.solr.common.cloud.ZkStateReader.COLLECTIONS_ZKNODE;
 import java.io.Closeable;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
-import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.concurrent.CountDownLatch;
@@ -128,7 +126,6 @@ public class RecoveryStrategy implements Runnable, Closeable {
   private boolean recoveringAfterStartup;
   private volatile Cancellable prevSendPreRecoveryHttpUriRequest;
   private volatile Replica.Type replicaType;
-  private volatile CoreDescriptor coreDescriptor;
 
   private final CoreContainer cc;
 
@@ -225,15 +222,15 @@ public class RecoveryStrategy implements Runnable, Closeable {
     return leaderprops.getCoreUrl();
   }
 
-  final private IndexFetcher.IndexFetchResult replicate(Replica leaderprops)
+  final private IndexFetcher.IndexFetchResult replicate(Replica leader)
       throws SolrServerException, IOException {
 
-    log.info("Attempting to replicate from [{}].", leaderprops);
-
-    final String leaderUrl = getReplicateLeaderUrl(leaderprops, zkStateReader);
+    log.info("Attempting to replicate from [{}].", leader);
 
+    String leaderUrl;
     // send commit
     try {
+      leaderUrl = leader.getCoreUrl();
       commitOnLeader(leaderUrl);
     } catch (Exception e) {
       log.error("Commit on leader failed", e);
@@ -314,14 +311,14 @@ public class RecoveryStrategy implements Runnable, Closeable {
           throw new AlreadyClosedException("SolrCore is null, won't do recovery");
         }
 
-        coreDescriptor = core.getCoreDescriptor();
+        CoreDescriptor coreDescriptor = core.getCoreDescriptor();
         replicaType = coreDescriptor.getCloudDescriptor().getReplicaType();
 
         recoveryOnlyClient = core.getCoreContainer().getUpdateShardHandler().getRecoveryOnlyClient();
         SolrRequestHandler handler = core.getRequestHandler(ReplicationHandler.PATH);
         replicationHandler = (ReplicationHandler) handler;
 
-        doRecovery(core);
+        doRecovery(core, coreDescriptor);
       }
     } catch (InterruptedException e) {
       log.info("InterruptedException, won't do recovery", e);
@@ -336,9 +333,9 @@ public class RecoveryStrategy implements Runnable, Closeable {
     }
   }
 
-  final public void doRecovery(SolrCore core) throws Exception {
+  final public void doRecovery(SolrCore core, CoreDescriptor coreDescriptor) throws Exception {
     int tries = 0;
-    while (!isClosed()) {
+    while (!isClosed() && !core.isClosing() && !core.isClosed()) {
       tries++;
       try {
         try {
@@ -350,7 +347,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
           // expected
         }
 
-        Replica leader = zkController.getZkStateReader().getLeaderRetry(coreDescriptor.getCollectionName(), coreDescriptor.getCloudDescriptor().getShardId(), 1500, false);
+        Replica leader = zkController.getZkStateReader().getLeaderRetry(coreDescriptor.getCollectionName(), coreDescriptor.getCloudDescriptor().getShardId(), 3000, false);
 
         if (leader != null && leader.getName().equals(coreName)) {
           log.info("We are the leader, STOP recovery");
@@ -363,12 +360,12 @@ public class RecoveryStrategy implements Runnable, Closeable {
           return;
         }
         boolean successfulRecovery;
-        if (this.coreDescriptor.getCloudDescriptor().requiresTransactionLog()) {
+        if (coreDescriptor.getCloudDescriptor().requiresTransactionLog()) {
           if (log.isDebugEnabled()) log.debug("Sync or replica recovery");
-          successfulRecovery = doSyncOrReplicateRecovery(core);
+          successfulRecovery = doSyncOrReplicateRecovery(core, leader);
         } else {
           if (log.isDebugEnabled()) log.debug("Replicate only recovery");
-          successfulRecovery = doReplicateOnlyRecovery(core);
+          successfulRecovery = doReplicateOnlyRecovery(core, leader);
         }
 
         if (successfulRecovery) {
@@ -384,7 +381,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
     }
   }
 
-  final private boolean doReplicateOnlyRecovery(SolrCore core) throws Exception {
+  final private boolean doReplicateOnlyRecovery(SolrCore core, Replica leader) throws Exception {
     boolean successfulRecovery = false;
 
     // if (core.getUpdateHandler().getUpdateLog() != null) {
@@ -394,18 +391,19 @@ public class RecoveryStrategy implements Runnable, Closeable {
     // return;
     // }
 
-    log.info("Publishing state of core [{}] as recovering {}", coreName, "doReplicateOnlyRecovery");
-
-    zkController.publish(this.coreDescriptor, Replica.State.RECOVERING);
-
-    while (!successfulRecovery && !isClosed()) { // don't use interruption or
+    int cnt = 0;
+    while (!successfulRecovery && !isClosed() && !core.isClosing() && !core.isClosed()) { // don't use interruption or
       // it will close channels
       // though
+      cnt++;
       try {
-        CloudDescriptor cloudDesc = this.coreDescriptor.getCloudDescriptor();
-        Replica leader;
+        CoreDescriptor coreDescriptor = core.getCoreDescriptor();
+        CloudDescriptor cloudDesc = coreDescriptor.getCloudDescriptor();
+
         try {
-          leader = zkStateReader.getLeaderRetry(cloudDesc.getCollectionName(), cloudDesc.getShardId(), 1500, false);
+          if (cnt > 1) {
+            leader = zkStateReader.getLeaderRetry(cloudDesc.getCollectionName(), cloudDesc.getShardId(), 3000, false);
+          }
 
           if (leader != null && leader.getName().equals(coreName)) {
             log.info("We are the leader, STOP recovery");
@@ -450,7 +448,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
           zkController.startReplicationFromLeader(coreName, false);
           log.info("Registering as Active after recovery.");
           try {
-            zkController.publish(this.coreDescriptor, Replica.State.ACTIVE);
+            zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
           } catch (Exception e) {
             log.error("Could not publish as ACTIVE after succesful recovery", e);
             successfulRecovery = false;
@@ -474,7 +472,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
             close = true;
             log.error("Recovery failed - max retries exceeded (" + retries + ").");
             try {
-              recoveryFailed(zkController, baseUrl, this.coreDescriptor);
+              recoveryFailed(zkController, baseUrl, core.getCoreDescriptor());
             } catch (InterruptedException e) {
 
             } catch (Exception e) {
@@ -487,7 +485,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
       }
 
       if (!successfulRecovery) {
-        waitForRetry();
+        waitForRetry(core);
       } else {
         break;
       }
@@ -503,19 +501,8 @@ public class RecoveryStrategy implements Runnable, Closeable {
   }
 
   // TODO: perhaps make this grab a new core each time through the loop to handle core reloads?
-  public final boolean doSyncOrReplicateRecovery(SolrCore core) throws Exception {
-    log.info("Do peersync or replication recovery core={} collection={}", coreName, coreDescriptor.getCollectionName());
-
-    Replica leader = zkController.getZkStateReader().getLeaderRetry(coreDescriptor.getCollectionName(), coreDescriptor.getCloudDescriptor().getShardId(), 1500);
-    if (leader != null && leader.getName().equals(coreName)) {
-      log.info("We are the leader, STOP recovery");
-      close = true;
-      throw new AlreadyClosedException();
-    }
-
-    log.info("Publishing state of core [{}] as recovering {}", coreName, "doSyncOrReplicateRecovery");
-
-    zkController.publish(this.coreDescriptor, Replica.State.RECOVERING);
+  public final boolean doSyncOrReplicateRecovery(SolrCore core, Replica leader) throws Exception {
+    log.info("Do peersync or replication recovery core={} collection={}", coreName, core.getCoreDescriptor().getCollectionName());
 
     boolean successfulRecovery = false;
     boolean publishedActive = false;
@@ -525,7 +512,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
     if (ulog == null) {
       SolrException.log(log, "No UpdateLog found - cannot recover.");
       close = true;
-      recoveryFailed(zkController, baseUrl, this.coreDescriptor);
+      recoveryFailed(zkController, baseUrl, core.getCoreDescriptor());
       return false;
     }
 
@@ -537,12 +524,12 @@ public class RecoveryStrategy implements Runnable, Closeable {
       recentVersions = recentUpdates.getVersions(ulog.getNumRecordsToKeep());
     } catch (Exception e) {
       log.error("Corrupt tlog - ignoring.", e);
-      recentVersions = new ArrayList<>(0);
+      recentVersions = null;
     }
 
     List<Long> startingVersions = ulog.getStartingVersions();
 
-    if (startingVersions != null && recoveringAfterStartup) {
+    if (startingVersions != null && recentVersions != null && recoveringAfterStartup) {
       try {
         int oldIdx = 0; // index of the start of the old list in the current list
         long firstStartingVersion = startingVersions.size() > 0 ? startingVersions.get(0) : 0;
@@ -597,33 +584,26 @@ public class RecoveryStrategy implements Runnable, Closeable {
       zkController.stopReplicationFromLeader(coreName);
     }
 
-    Future<RecoveryInfo> replayFuture = null;
+    log.info("Publishing state of core [{}] as buffering {}", coreName, "doSyncOrReplicateRecovery");
+
+    zkController.publish(core.getCoreDescriptor(), Replica.State.BUFFERING);
 
-    while (!successfulRecovery && !isClosed()) {
+    Future<RecoveryInfo> replayFuture = null;
+    int cnt = 0;
+    while (!successfulRecovery && !isClosed() && !core.isClosing() && !core.isClosed()) {
+      cnt++;
       try {
-        CloudDescriptor cloudDesc = this.coreDescriptor.getCloudDescriptor();
-        leader = zkStateReader.getLeaderRetry(cloudDesc.getCollectionName(), cloudDesc.getShardId(), 1500);
+        CloudDescriptor cloudDesc = core.getCoreDescriptor().getCloudDescriptor();
 
+        if (cnt > 1) {
+          leader = zkStateReader.getLeaderRetry(cloudDesc.getCollectionName(), cloudDesc.getShardId(), 3000, false);
+        }
         if (leader != null && leader.getName().equals(coreName)) {
           log.info("We are the leader, STOP recovery");
           close = true;
           return false;
         }
 
-        log.info("Begin buffering updates. core=[{}]", coreName);
-        // recalling buffer updates will drop the old buffer tlog
-        ulog.bufferUpdates();
-
-//        try {
-//          if (prevSendPreRecoveryHttpUriRequest != null) {
-//            prevSendPreRecoveryHttpUriRequest.cancel();
-//          }
-//        } catch (NullPointerException e) {
-//          // okay
-//        }
-       // TODO can we do this with commit on leader
-        sendPrepRecoveryCmd(leader.getBaseUrl(), leader.getName(), zkStateReader.getClusterState().getCollection(coreDescriptor.getCollectionName()).getSlice(cloudDesc.getShardId()));
-
         // we wait a bit so that any updates on the leader
         // that started before they saw recovering state
         // are sure to have finished (see SOLR-7141 for
@@ -684,6 +664,21 @@ public class RecoveryStrategy implements Runnable, Closeable {
 
           try {
 
+            log.info("Begin buffering updates. core=[{}]", coreName);
+            // recalling buffer updates will drop the old buffer tlog
+            ulog.bufferUpdates();
+
+            //        try {
+            //          if (prevSendPreRecoveryHttpUriRequest != null) {
+            //            prevSendPreRecoveryHttpUriRequest.cancel();
+            //          }
+            //        } catch (NullPointerException e) {
+            //          // okay
+            //        }
+
+            sendPrepRecoveryCmd(leader.getBaseUrl(), leader.getName(), zkStateReader.getClusterState().
+                getCollection(core.getCoreDescriptor().getCollectionName()).getSlice(cloudDesc.getShardId()), core.getCoreDescriptor());
+
             IndexFetcher.IndexFetchResult result = replicate(leader);
 
             if (result.getSuccessful()) {
@@ -730,7 +725,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
               }
             }
 
-            zkController.publish(this.coreDescriptor, Replica.State.ACTIVE);
+            zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
             publishedActive = true;
             close = true;
 
@@ -741,7 +736,6 @@ public class RecoveryStrategy implements Runnable, Closeable {
           } catch (Exception e) {
             log.error("Could not publish as ACTIVE after successful recovery", e);
             successfulRecovery = false;
-            close = false;
           }
 
 
@@ -766,7 +760,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
             SolrException.log(log, "Recovery failed - max retries exceeded (" + retries + ").");
             close = true;
             try {
-              recoveryFailed(zkController, baseUrl, this.coreDescriptor);
+              recoveryFailed(zkController, baseUrl, core.getCoreDescriptor());
             } catch (InterruptedException e) {
 
             } catch (Exception e) {
@@ -778,8 +772,8 @@ public class RecoveryStrategy implements Runnable, Closeable {
         }
       }
 
-      if (!successfulRecovery && !isClosed()) {
-        waitForRetry();
+      if (!successfulRecovery && !isClosed() && !core.isClosing() && !core.isClosed()) {
+        waitForRetry(core);
       } else if (successfulRecovery) {
         break;
       }
@@ -799,7 +793,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
     return successfulRecovery;
   }
 
-  private final void waitForRetry() {
+  private final void waitForRetry(SolrCore core) {
     try {
       if (close) throw new AlreadyClosedException();
       long wait = startingRecoveryDelayMilliSeconds;
@@ -816,7 +810,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
 
       TimeOut timeout = new TimeOut(wait, TimeUnit.MILLISECONDS, TimeSource.NANO_TIME);
       while (!timeout.hasTimedOut()) {
-        if (isClosed()) {
+        if (isClosed() && !core.isClosing() && !core.isClosed()) {
           log.info("RecoveryStrategy has been closed");
           return;
         }
@@ -904,7 +898,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
     return close || cc.isShutDown();
   }
 
-  final private void sendPrepRecoveryCmd(String leaderBaseUrl, String leaderCoreName, Slice slice)
+  final private void sendPrepRecoveryCmd(String leaderBaseUrl, String leaderCoreName, Slice slice, CoreDescriptor coreDescriptor)
       throws SolrServerException, IOException {
 
     if (coreDescriptor.getCollectionName() == null) {
@@ -913,15 +907,11 @@ public class RecoveryStrategy implements Runnable, Closeable {
 
     WaitForState prepCmd = new WaitForState();
     prepCmd.setCoreName(coreName);
-    prepCmd.setState(Replica.State.RECOVERING);
+    prepCmd.setState(Replica.State.BUFFERING);
     prepCmd.setCollection(coreDescriptor.getCollectionName());
     prepCmd.setShardId(coreDescriptor.getCloudDescriptor().getShardId());
-    final Slice.State state = slice.getState();
-    if (state != Slice.State.CONSTRUCTION && state != Slice.State.RECOVERY && state != Slice.State.RECOVERY_FAILED) {
-      prepCmd.setOnlyIfLeaderActive(true);
-    }
 
-    log.info("Sending prep recovery command to {} for core {} params={}", leaderBaseUrl, leaderCoreName, prepCmd.getParams());
+    log.info("Sending prep recovery command to {} for leader={} params={}", leaderBaseUrl, leaderCoreName, prepCmd.getParams());
 
     int conflictWaitMs = zkController.getLeaderConflictResolveWait();
     int readTimeout = conflictWaitMs + Integer.parseInt(System.getProperty("prepRecoveryReadTimeoutExtraWait", "10000"));
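
The hunks above make the retry wait cancellable against both the strategy and the core. A minimal sketch of that kind of cancellable backoff wait, assuming only java.util.concurrent and a caller-supplied cancellation check (the real code uses Solr's TimeOut/TimeSource utilities; plain nanoTime stands in for them here):

    import java.util.concurrent.TimeUnit;
    import java.util.function.BooleanSupplier;

    // Hypothetical helper mirroring waitForRetry(core): sleep in small slices,
    // checking a cancellation predicate between slices.
    final class RetryWait {
      static boolean await(long waitMs, BooleanSupplier cancelled) throws InterruptedException {
        long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(waitMs);
        while (System.nanoTime() < deadline) {
          if (cancelled.getAsBoolean()) {
            return false; // strategy closed or core shutting down; abort the retry
          }
          Thread.sleep(100); // poll interval between cancellation checks
        }
        return true; // full wait elapsed; the caller may attempt recovery again
      }
    }

A caller would pass something like () -> isClosed() || core.isClosing() || core.isClosed(), matching the checks introduced above.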
diff --git a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
index 93cfb38..c78c401 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
@@ -38,7 +38,6 @@ import org.apache.solr.logging.MDCLoggingContext;
 import org.apache.solr.update.PeerSync;
 import org.apache.solr.update.UpdateLog;
 import org.apache.zookeeper.KeeperException;
-import org.apache.zookeeper.KeeperException.SessionExpiredException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -59,10 +58,10 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
 
   public ShardLeaderElectionContext(LeaderElector leaderElector,
                                     final String shardId, final String collection,
-                                    final String coreNodeName, Replica props, ZkController zkController, CoreContainer cc) {
+                                    final String coreNodeName, Replica props, ZkController zkController, CoreContainer cc, CoreDescriptor cd) {
     super(coreNodeName, ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection
                     + "/leader_elect/" + shardId,  ZkStateReader.getShardLeadersPath(
-            collection, shardId), props,
+            collection, shardId), props, cd,
             zkController.getZkClient());
     this.cc = cc;
     this.syncStrategy = new SyncStrategy(cc);
@@ -79,7 +78,7 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
 
   @Override
   public ElectionContext copy() {
-    return new ShardLeaderElectionContext(leaderElector, shardId, collection, id, leaderProps, zkController, cc);
+    return new ShardLeaderElectionContext(leaderElector, shardId, collection, id, leaderProps, zkController, cc, cd);
   }
 
 
@@ -103,19 +102,18 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
         log.error("No SolrCore found, cannot become leader {}", coreName);
         throw new AlreadyClosedException("No SolrCore found, cannot become leader " + coreName);
       }
-      if (core.isClosing() || core.getCoreContainer().isShutDown()) {
-        log.info("We are closed, will not become leader");
-        closed = true;
-        cancelElection();
-        return false;
-      }
+//      if (core.isClosing() || core.getCoreContainer().isShutDown()) {
+//        log.info("We are closed, will not become leader");
+//        closed = true;
+//        cancelElection();
+//        return false;
+//      }
       try {
 
-        core.getSolrCoreState().cancelRecovery(true, false);
+        core.getSolrCoreState().cancelRecovery(false, false);
 
         ActionThrottle lt;
 
-        MDCLoggingContext.setCore(core);
         lt = core.getUpdateHandler().getSolrCoreState().getLeaderThrottle();
 
         lt.minimumWaitBetweenActions();
@@ -138,7 +136,8 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
         replicaType = cloudCd.getReplicaType();
         // should I be leader?
 
-        ZkShardTerms zkShardTerms = zkController.getShardTerms(collection, shardId);
+        if (log.isDebugEnabled()) log.debug("Check zkShardTerms");
+        ZkShardTerms zkShardTerms = zkController.getShardTermsOrNull(collection, shardId);
         try {
           // if the replica is waiting for leader to see recovery state, the leader should refresh its terms
           if (zkShardTerms != null && zkShardTerms.skipSendingUpdatesTo(coreName)) {
@@ -152,7 +151,7 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
           log.error("Exception while looking at refreshing shard terms", e);
         }
         
-        if (zkShardTerms.registered(coreName) && !zkShardTerms.canBecomeLeader(coreName)) {
+        if (zkShardTerms != null && zkShardTerms.registered(coreName) && !zkShardTerms.canBecomeLeader(coreName)) {
           if (!waitForEligibleBecomeLeaderAfterTimeout(zkShardTerms, coreName, leaderVoteWait)) {
             rejoinLeaderElection(core);
             return false;
@@ -166,9 +165,9 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
 
         PeerSync.PeerSyncResult result = null;
         boolean success = false;
-        if (core.getCoreContainer().isShutDown()) {
-          return false;
-        }
+//        if (core.getCoreContainer().isShutDown()) {
+//          return false;
+//        }
         result = syncStrategy.sync(zkController, core, leaderProps, weAreReplacement);
         log.info("Sync strategy sync result {}", result);
         success = result.isSuccess();
@@ -242,11 +241,17 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
         // in case of leaderVoteWait timeout, a replica with lower term can win the election
         if (setTermToMax) {
           log.error("WARNING: Potential data loss -- Replica {} became leader after timeout (leaderVoteWait) " + "without being up-to-date with the previous leader", coreName);
-          zkController.createCollectionTerms(collection);
-          zkController.getShardTerms(collection, shardId).setTermEqualsToLeader(coreName);
+          try {
+            zkController.getShardTerms(collection, shardId).setTermEqualsToLeader(coreName);
+          } catch (Exception e) {
+            log.error("Exception trying to set shard terms equal to leader", e);
+          }
         }
 
-        super.runLeaderProcess(context, weAreReplacement, 0);
+        boolean leaderSuccess = super.runLeaderProcess(context, weAreReplacement, 0);
+        if (!leaderSuccess) {
+          return false;
+        }
 
         ZkNodeProps zkNodes = ZkNodeProps
             .fromKeyVals(Overseer.QUEUE_OPERATION, OverseerAction.STATE.toLower(), ZkStateReader.COLLECTION_PROP, collection, ZkStateReader.CORE_NAME_PROP, leaderProps.getName(),
@@ -264,21 +269,14 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
         throw new SolrException(ErrorCode.SERVER_ERROR, e);
       } catch (Exception e) {
         SolrException.log(log, "There was a problem trying to register as the leader", e);
-        // we could not register ourselves as leader - try and rejoin election
-
-        rejoinLeaderElection(core);
-        return false;
+        throw new SolrException(ErrorCode.SERVER_ERROR, e);
       }
 
-
     } catch (AlreadyClosedException e) {
       log.info("Already closed, won't become leader");
-      closed = true;
-      cancelElection();
       throw e;
-    } finally {
-      MDCLoggingContext.clear();
     }
+
     return true;
   }
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
index 1a3ac74..7e1adf3 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
@@ -33,6 +33,7 @@ import org.apache.solr.common.cloud.SolrZkClient;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.util.Utils;
+import org.apache.solr.core.CoreDescriptor;
 import org.apache.zookeeper.CreateMode;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.KeeperException.NoNodeException;
@@ -51,8 +52,8 @@ class ShardLeaderElectionContextBase extends ElectionContext {
   protected volatile Integer leaderZkNodeParentVersion;
 
   public ShardLeaderElectionContextBase(final String coreNodeName, String electionPath, String leaderPath,
-                                        Replica props, SolrZkClient zkClient) {
-    super(coreNodeName, electionPath, leaderPath, props);
+                                        Replica props, CoreDescriptor cd, SolrZkClient zkClient) {
+    super(coreNodeName, electionPath, leaderPath, props, cd);
     this.zkClient = zkClient;
   }
 
@@ -213,11 +214,11 @@ class ShardLeaderElectionContextBase extends ElectionContext {
       log.warn("No node exists for election", e);
       throw new AlreadyClosedException("No node exists for election");
     } catch (KeeperException.NodeExistsException e) {
-      log.warn("Node already exists for election", e);
+      log.error("Node already exists for election", e);
 
-      zkClient.delete(leaderPath, -1);
-
-      runLeaderProcess(context, weAreReplacement, pauseBeforeStartMs);
+      return false;
+    } catch (AlreadyClosedException e) {
+      throw e;
     } catch (Throwable t) {
       log.warn("Could not register as the leader because creating the ephemeral registration node in ZooKeeper failed: ", t);
       throw new SolrException(ErrorCode.SERVER_ERROR, "Could not register as the leader because creating the ephemeral registration node in ZooKeeper failed: " + errors, t);
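
The base-context change above stops deleting and retrying on NodeExistsException and instead loses the election. A sketch of registering an ephemeral leader node with the plain ZooKeeper client under that policy (path and payload are placeholders):

    import org.apache.zookeeper.CreateMode;
    import org.apache.zookeeper.KeeperException;
    import org.apache.zookeeper.ZooDefs;
    import org.apache.zookeeper.ZooKeeper;

    final class LeaderNode {
      // Returns true if we registered as leader, false if another node already holds the path.
      static boolean tryRegister(ZooKeeper zk, String leaderPath, byte[] props)
          throws KeeperException, InterruptedException {
        try {
          zk.create(leaderPath, props, ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
          return true;
        } catch (KeeperException.NodeExistsException e) {
          return false; // lose the election rather than delete the existing node
        }
      }
    }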
diff --git a/solr/core/src/java/org/apache/solr/cloud/SizeLimitedDistributedMap.java b/solr/core/src/java/org/apache/solr/cloud/SizeLimitedDistributedMap.java
index a0a8391..2a982c0 100644
--- a/solr/core/src/java/org/apache/solr/cloud/SizeLimitedDistributedMap.java
+++ b/solr/core/src/java/org/apache/solr/cloud/SizeLimitedDistributedMap.java
@@ -19,6 +19,7 @@ package org.apache.solr.cloud;
 import java.util.List;
 import org.apache.lucene.util.PriorityQueue;
 import org.apache.solr.common.cloud.SolrZkClient;
+import org.apache.zookeeper.CreateMode;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.data.Stat;
 
@@ -27,7 +28,7 @@ import org.apache.zookeeper.data.Stat;
  * Oldest znodes (as per modification time) are evicted as newer ones come in.
  *
  * When the map hits the specified maximum size, the oldest <code>maxSize / 10</code> items
- * are evicted on the next {@link #put(String, byte[])} invocation.
+ * are evicted on the next {@link #put(String, byte[], CreateMode)} invocation.
  */
 public class SizeLimitedDistributedMap extends DistributedMap {
 
@@ -49,7 +50,7 @@ public class SizeLimitedDistributedMap extends DistributedMap {
   }
 
   @Override
-  public void put(String trackingId, byte[] data) throws KeeperException, InterruptedException {
+  public void put(String trackingId, byte[] data, CreateMode createMode) throws KeeperException, InterruptedException {
     if (this.size() >= maxSize) {
       // Bring down the size
       List<String> children = zookeeper.getChildren(dir, null, true);
@@ -79,7 +80,7 @@ public class SizeLimitedDistributedMap extends DistributedMap {
       }
     }
 
-    super.put(trackingId, data);
+    super.put(trackingId, data, createMode);
   }
   
   interface OnOverflowObserver {
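
The map's put now carries a CreateMode and, when at capacity, evicts the oldest maxSize/10 entries by znode modification order. A sketch of that eviction policy over an in-memory stand-in, assuming each entry keeps the monotonically increasing mzxid from its Stat (the real class uses Lucene's PriorityQueue; java.util's is used here):

    import java.util.Comparator;
    import java.util.Map;
    import java.util.PriorityQueue;
    import java.util.concurrent.ConcurrentHashMap;

    // Hypothetical in-memory analogue of the eviction in put(...):
    // when full, drop the oldest tenth of the entries by mzxid.
    final class SizeLimitedMapSketch {
      final int maxSize;
      final Map<String, Long> mzxids = new ConcurrentHashMap<>(); // trackingId -> mzxid

      SizeLimitedMapSketch(int maxSize) { this.maxSize = maxSize; }

      void evictIfFull() {
        if (mzxids.size() < maxSize) return;
        int toEvict = Math.max(1, maxSize / 10);
        PriorityQueue<Map.Entry<String, Long>> oldest =
            new PriorityQueue<>(Comparator.comparingLong(Map.Entry::getValue));
        oldest.addAll(mzxids.entrySet());
        for (int i = 0; i < toEvict && !oldest.isEmpty(); i++) {
          mzxids.remove(oldest.poll().getKey()); // the real map deletes the znode here
        }
      }
    }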
diff --git a/solr/core/src/java/org/apache/solr/cloud/StatePublisher.java b/solr/core/src/java/org/apache/solr/cloud/StatePublisher.java
index 45d5fda..02750da 100644
--- a/solr/core/src/java/org/apache/solr/cloud/StatePublisher.java
+++ b/solr/core/src/java/org/apache/solr/cloud/StatePublisher.java
@@ -16,21 +16,23 @@
  */
 package org.apache.solr.cloud;
 
+import org.apache.solr.cloud.overseer.OverseerAction;
 import org.apache.solr.common.ParWork;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.util.Utils;
+import org.apache.solr.core.CoreContainer;
 import org.apache.zookeeper.KeeperException;
-import org.eclipse.jetty.util.BlockingArrayQueue;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.Closeable;
 import java.lang.invoke.MethodHandles;
-import java.util.Collections;
+import java.util.Collection;
 import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
@@ -41,13 +43,14 @@ public class StatePublisher implements Closeable {
 
   private final Map<String,String> stateCache = new ConcurrentHashMap<>(32, 0.75f, 4);
   private final ZkStateReader zkStateReader;
+  private final CoreContainer cc;
 
   public static class NoOpMessage extends ZkNodeProps {
   }
 
   public static final NoOpMessage TERMINATE_OP = new NoOpMessage();
 
-  private final BlockingArrayQueue<ZkNodeProps> workQueue = new BlockingArrayQueue<>(30, 10);
+  private final ArrayBlockingQueue<ZkNodeProps> workQueue = new ArrayBlockingQueue<>(300, true);
   private final ZkDistributedQueue overseerJobQueue;
   private volatile Worker worker;
   private volatile Future<?> workerFuture;
@@ -67,7 +70,7 @@ public class StatePublisher implements Closeable {
         bulkMessage.getProperties().put("operation", "state");
         try {
           try {
-            message = workQueue.poll(5, TimeUnit.SECONDS);
+            message = workQueue.poll(15, TimeUnit.SECONDS);
           } catch (InterruptedException e) {
 
           }
@@ -106,8 +109,10 @@ public class StatePublisher implements Closeable {
     }
 
     private void bulkMessage(ZkNodeProps zkNodeProps, ZkNodeProps bulkMessage) throws KeeperException, InterruptedException {
-      if (zkNodeProps.getStr("operation").equals("downnode")) {
-        bulkMessage.getProperties().put("downnode", zkNodeProps.getStr(ZkStateReader.NODE_NAME_PROP));
+      OverseerAction action = OverseerAction.get(zkNodeProps.getStr("operation"));
+      if (action == OverseerAction.DOWNNODE) {
+        bulkMessage.getProperties().put(OverseerAction.DOWNNODE.toLower(), zkNodeProps.getStr(ZkStateReader.NODE_NAME_PROP));
+      } else if (action == OverseerAction.RECOVERYNODE) {
+        bulkMessage.getProperties().put(OverseerAction.RECOVERYNODE.toLower(), zkNodeProps.getStr(ZkStateReader.NODE_NAME_PROP));
       } else {
         String collection = zkNodeProps.getStr(ZkStateReader.COLLECTION_PROP);
         String core = zkNodeProps.getStr(ZkStateReader.CORE_NAME_PROP);
@@ -123,17 +128,14 @@ public class StatePublisher implements Closeable {
     }
 
     private void processMessage(ZkNodeProps message) throws KeeperException, InterruptedException {
-      // do it in a separate thread so that we can be stopped by interrupt without screwing up the ZooKeeper client
-      ParWork.getRootSharedExecutor().invokeAll(Collections.singletonList(() -> {
-        overseerJobQueue.offer(Utils.toJSON(message));
-        return null;
-      }));
+      overseerJobQueue.offer(Utils.toJSON(message));
     }
   }
 
-  public StatePublisher(ZkDistributedQueue overseerJobQueue, ZkStateReader zkStateReader) {
+  public StatePublisher(ZkDistributedQueue overseerJobQueue, ZkStateReader zkStateReader, CoreContainer cc) {
     this.overseerJobQueue = overseerJobQueue;
     this.zkStateReader = zkStateReader;
+    this.cc = cc;
   }
 
   public void submitState(ZkNodeProps stateMessage) {
@@ -159,8 +161,21 @@ public class StatePublisher implements Closeable {
         }
 
         stateCache.put(core, state);
-      } else if (operation.equalsIgnoreCase("downnode")) {
-        // set all statecache entries for replica to DOWN
+      } else if (operation.equalsIgnoreCase(OverseerAction.DOWNNODE.toLower())) {
+        // mark every local core as DOWN in the state cache
+
+        Collection<String> coreNames = cc.getAllCoreNames();
+        for (String core : coreNames) {
+          stateCache.put(core, Replica.State.getShortState(Replica.State.DOWN));
+        }
+
+      } else if (operation.equalsIgnoreCase(OverseerAction.RECOVERYNODE.toLower())) {
+        // mark every local core as RECOVERING in the state cache
+
+        Collection<String> coreNames = cc.getAllCoreNames();
+        for (String core : coreNames) {
+          stateCache.put(core, Replica.State.getShortState(Replica.State.RECOVERING));
+        }
 
       } else {
         throw new IllegalArgumentException(stateMessage.toString());
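
StatePublisher now drains a bounded queue and coalesces individual replica-state messages into one bulk "state" operation before offering it to the Overseer queue. A minimal sketch of that coalescing worker, assuming messages are simple string maps with an "operation" key (the real code folds per-core state into a ZkNodeProps):

    import java.util.HashMap;
    import java.util.Map;
    import java.util.concurrent.ArrayBlockingQueue;
    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.TimeUnit;

    // Hypothetical distillation of the Worker loop above: poll with a timeout,
    // fold every queued message into one bulk map, then publish the batch.
    final class BatchingPublisher implements Runnable {
      final BlockingQueue<Map<String, String>> queue = new ArrayBlockingQueue<>(300, true);
      volatile boolean terminate;

      public void run() {
        while (!terminate) {
          try {
            Map<String, String> first = queue.poll(15, TimeUnit.SECONDS);
            if (first == null) continue;            // idle; nothing to batch
            Map<String, String> bulk = new HashMap<>(first);
            bulk.put("operation", "state");
            Map<String, String> next;
            while ((next = queue.poll()) != null) { // drain whatever else is queued
              bulk.putAll(next);
            }
            publish(bulk);
          } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return;
          }
        }
      }

      void publish(Map<String, String> bulk) { System.out.println(bulk); } // stands in for overseerJobQueue.offer
    }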
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkCollectionTerms.java b/solr/core/src/java/org/apache/solr/cloud/ZkCollectionTerms.java
index 9e6f71e..af71739 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkCollectionTerms.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkCollectionTerms.java
@@ -17,11 +17,9 @@
 
 package org.apache.solr.cloud;
 
-import org.apache.solr.common.AlreadyClosedException;
 import org.apache.solr.common.cloud.SolrZkClient;
 import org.apache.solr.common.util.IOUtils;
 import org.apache.solr.common.util.ObjectReleaseTracker;
-import org.apache.solr.core.CoreDescriptor;
 import org.apache.zookeeper.KeeperException;
 
 import java.util.Map;
@@ -63,15 +61,15 @@ class ZkCollectionTerms implements AutoCloseable {
     return terms.get(shardId);
   }
 
-  public void register(String shardId, String coreNodeName) throws Exception {
+  public void register(String shardId, String name) throws Exception {
     if (closed) return;
-    getShard(shardId).registerTerm(coreNodeName);
+    getShard(shardId).registerTerm(name);
   }
 
-  public void remove(String shardId, CoreDescriptor coreDescriptor) throws KeeperException, InterruptedException {
+  public void remove(String shardId, String name) throws KeeperException, InterruptedException {
     ZkShardTerms zterms = getShardOrNull(shardId);
     if (zterms != null) {
-      if (zterms.removeTerm(coreDescriptor)) {
+      if (zterms.removeTermFor(name)) {
         IOUtils.closeQuietly(terms.remove(shardId));
       }
     }
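
remove(shardId, name) above now closes and drops the per-shard terms object once its last term is gone. A sketch of that close-on-last-removal pattern over a plain map, with a quiet close standing in for IOUtils.closeQuietly:

    import java.io.Closeable;
    import java.io.IOException;
    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    // Hypothetical analogue of ZkCollectionTerms.remove(...): when removing a
    // name empties a shard's terms, close and discard that shard entry.
    final class ShardTermsRegistry<T extends Closeable> {
      final Map<String, T> terms = new ConcurrentHashMap<>();

      interface LastTermCheck<T> { boolean removedLast(T shardTerms, String name); }

      void remove(String shardId, String name, LastTermCheck<T> check) {
        T shardTerms = terms.get(shardId);
        if (shardTerms != null && check.removedLast(shardTerms, name)) {
          T removed = terms.remove(shardId);
          if (removed != null) {
            try { removed.close(); } catch (IOException ignored) { /* quiet close */ }
          }
        }
      }
    }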
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index f146d1b..4b32f64 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -33,7 +33,6 @@ import org.apache.solr.common.cloud.ConnectionManager;
 import org.apache.solr.common.cloud.DefaultZkACLProvider;
 import org.apache.solr.common.cloud.DefaultZkCredentialsProvider;
 import org.apache.solr.common.cloud.DocCollection;
-import org.apache.solr.common.cloud.DocCollectionWatcher;
 import org.apache.solr.common.cloud.NodesSysPropsCacher;
 import org.apache.solr.common.cloud.OnReconnect;
 import org.apache.solr.common.cloud.Replica;
@@ -108,6 +107,7 @@ import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.Enumeration;
 import java.util.HashMap;
@@ -170,7 +170,33 @@ public class ZkController implements Closeable, Runnable {
 
   @Override
   public void run() {
-    disconnect(false);
+    disconnect(true);
+    Collection<SolrCore> cores = cc.getCores();
+    for (SolrCore core : cores) {
+      CoreDescriptor desc = core.getCoreDescriptor();
+      String collection = desc.getCollectionName();
+      try {
+        zkStateReader.waitForState(collection, 5, TimeUnit.SECONDS, (n, c) -> {
+          if (c != null) {
+            List<Replica> replicas = c.getReplicas();
+            for (Replica replica : replicas) {
+              if (replica.getNodeName().equals(getNodeName())) {
+                if (!replica.getState().equals(Replica.State.DOWN)) {
+                 // log.info("Found state {} {}", replica.getState(), replica.getNodeName());
+                  return false;
+                }
+              }
+            }
+          }
+          return true;
+        });
+      } catch (InterruptedException e) {
+        ParWork.propagateInterrupt(e);
+        return;
+      } catch (TimeoutException e) {
+        log.error("Timeout", e);
+      }
+    }
   }
 
   public boolean isDcCalled() {
@@ -329,15 +355,20 @@ public class ZkController implements Closeable, Runnable {
     }
 
     public Object call() throws Exception {
-      if (log.isInfoEnabled()) {
-        log.info("Registering core {} afterExpiration? {}", descriptor.getName(), afterExpiration);
-      }
+      MDCLoggingContext.setCoreName(descriptor.getName());
+      try {
+        log.info("Registering core with ZK {} afterExpiration? {}", descriptor.getName(), afterExpiration);
 
-      if (zkController.isDcCalled() || zkController.getCoreContainer().isShutDown() || (afterExpiration && !descriptor.getCloudDescriptor().hasRegistered())) {
-        return null;
+        if (zkController.isDcCalled() || zkController.getCoreContainer().isShutDown() || (afterExpiration && !descriptor.getCloudDescriptor().hasRegistered())) {
+          return null;
+        }
+        if (zkController.cc.getAllCoreNames().contains(descriptor.getName())) {
+          zkController.register(descriptor.getName(), descriptor, afterExpiration);
+        }
+        return descriptor;
+      } finally {
+        MDCLoggingContext.clear();
       }
-      zkController.register(descriptor.getName(), descriptor, afterExpiration);
-      return descriptor;
     }
   }
 
@@ -629,7 +660,7 @@ public class ZkController implements Closeable, Runnable {
   }
 
   public void disconnect(boolean publishDown) {
-    if (log.isDebugEnabled()) log.debug("disconnect");
+    log.info("disconnect");
     this.dcCalled = true;
 
     try {
@@ -642,27 +673,33 @@ public class ZkController implements Closeable, Runnable {
       closer.collect("replicateFromLeaders", replicateFromLeaders);
 
       if (publishDown) {
+        closer.collect(leaderElectors);
+
         closer.collect("PublishNodeAsDown&RepFromLeaders", () -> {
           try {
             log.info("Publish this node as DOWN...");
-            publishNodeAsDown(getNodeName());
+            publishNodeAs(getNodeName(), OverseerAction.DOWNNODE);
           } catch (Exception e) {
             ParWork.propagateInterrupt("Error publishing nodes as down. Continuing to close CoreContainer", e);
           }
           return "PublishDown";
         });
-        closer.collect();
       }
+    }
+  }
 
-      closer.collect(leaderElectors);
+  /**
+   * Closes the underlying ZooKeeper client.
+   */
+  public void close() {
+    if (log.isDebugEnabled()) log.debug("Closing ZkController");
+    //assert closeTracker.close();
 
-      closer.collect(overseerElector);
+    this.shudownCalled = true;
 
-      if (overseer != null) {
-        closer.collect("", () -> {
-          overseer.closeAndDone();
-        });
-      }
+    this.isClosed = true;
+    try (ParWork closer = new ParWork(this, true, true)) {
+      closer.collect(leaderElectors);
       closer.collect(sysPropsCacher);
       closer.collect(cloudManager);
       closer.collect(cloudSolrClient);
@@ -677,37 +714,32 @@ public class ZkController implements Closeable, Runnable {
         }
       });
 
-    } finally {
-      leaderElectors.clear();
-    }
-  }
-
-  /**
-   * Closes the underlying ZooKeeper client.
-   */
-  public void close() {
-    if (log.isDebugEnabled()) log.debug("Closing ZkController");
-    //assert closeTracker.close();
+      closer.collect(overseerElector);
 
-    this.shudownCalled = true;
+      if (overseer != null) {
+        closer.collect("", () -> {
+          try {
+            overseer.closeAndDone();
+          } catch (Exception e) {
+            log.warn("Exception closing Overseer", e);
+          }
+        });
+      }
 
-    this.isClosed = true;
-    try (ParWork closer = new ParWork(this, true, true)) {
-      closer.collect(leaderElectors);
       collectionToTerms.forEach((s, zkCollectionTerms) -> closer.collect(zkCollectionTerms));
-    }
-
-    IOUtils.closeQuietly(zkStateReader);
 
-    if (closeZkClient) {
-      zkClient.disableCloseLock();
-      IOUtils.closeQuietly(zkClient);
-    }
+    } finally {
+      IOUtils.closeQuietly(zkStateReader);
 
-    SolrLifcycleListener.removeShutdown(this);
+      if (closeZkClient && zkClient != null) {
+        zkClient.disableCloseLock();
+        IOUtils.closeQuietly(zkClient);
+      }
 
-    assert ObjectReleaseTracker.release(this);
+      SolrLifcycleListener.removeShutdown(this);
 
+      assert ObjectReleaseTracker.release(this);
+    }
   }
 
   /**
@@ -928,10 +960,6 @@ public class ZkController implements Closeable, Runnable {
     paths.put(Overseer.OVERSEER_ASYNC_IDS, null);
     paths.put(Overseer.OVERSEER_ELECT, null);
 
-
-    paths.put("/autoscaling", null);
-    paths.put("/autoscaling/events/.scheduled_maintenance", null);
-    paths.put("/autoscaling/events/.auto_add_replicas", null);
 //
     //   operations.add(zkClient.createPathOp(ZkStateReader.CLUSTER_PROPS, emptyJson));
     paths.put(ZkStateReader.SOLR_PKGS_PATH, null);
@@ -1162,12 +1190,13 @@ public class ZkController implements Closeable, Runnable {
       zkStateReader = new ZkStateReader(zkClient, () -> {
         if (cc != null) cc.securityNodeChanged();
       });
+      zkStateReader.setNode(nodeName);
       zkStateReader.setCollectionRemovedListener(collection -> removeCollectionTerms(collection));
       this.baseURL = zkStateReader.getBaseUrlForNodeName(this.nodeName);
 
       zkStateReader.createClusterStateWatchersAndUpdate();
 
-      statePublisher = new StatePublisher(overseerJobQueue, zkStateReader);
+      statePublisher = new StatePublisher(overseerJobQueue, zkStateReader, cc);
       statePublisher.start();
 
       this.sysPropsCacher = new NodesSysPropsCacher(getSolrCloudManager().getNodeStateProvider(), getNodeName(), zkStateReader);
@@ -1199,7 +1228,7 @@ public class ZkController implements Closeable, Runnable {
         //          });
       //}
 
-      publishDownStates();
+      publishNodeAs(getNodeName(), OverseerAction.RECOVERYNODE);
     } catch (InterruptedException e) {
       ParWork.propagateInterrupt(e);
       throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
@@ -1238,7 +1267,7 @@ public class ZkController implements Closeable, Runnable {
   }
 
   public void publishDownStates() throws KeeperException {
-    publishNodeAsDown(getNodeName());
+    publishNodeAs(getNodeName(), OverseerAction.DOWNNODE);
   }
 
   /**
@@ -1338,11 +1367,11 @@ public class ZkController implements Closeable, Runnable {
    *
    * @return the shardId for the SolrCore
    */
-  private String register(String coreName, final CoreDescriptor desc, boolean afterExpiration) throws Exception {
+  private String register(String coreName, final CoreDescriptor desc, boolean afterExpiration) {
     if (getCoreContainer().isShutDown() || isDcCalled()) {
       throw new AlreadyClosedException();
     }
-    MDCLoggingContext.setCoreDescriptor(cc, desc);
+    MDCLoggingContext.setCoreName(desc.getName());
     ZkShardTerms shardTerms = null;
     LeaderElector leaderElector = null;
     try {
@@ -1356,7 +1385,9 @@ public class ZkController implements Closeable, Runnable {
       AtomicReference<Replica> replicaRef = new AtomicReference<>();
 
       // the watcher is added to a set so multiple calls of this method will leave only one watcher
-      getZkStateReader().registerCore(cloudDesc.getCollectionName());
+      if (!cloudDesc.hasRegistered()) {
+        getZkStateReader().registerCore(cloudDesc.getCollectionName());
+      }
 
       try {
         log.info("Waiting to see our entry in state.json {}", desc.getName());
@@ -1377,7 +1408,7 @@ public class ZkController implements Closeable, Runnable {
       }
 
       Replica replica = replicaRef.get();
-      
+
       if (replica == null) {
         replica = zkStateReader.getClusterState().getCollection(collection).getReplica(coreName);
         if (replica == null) {
@@ -1397,50 +1428,49 @@ public class ZkController implements Closeable, Runnable {
 
       log.info("Create leader elector for replica {}", coreName);
       leaderElector = leaderElectors.get(replica.getName());
-      if (leaderElector == null) {
+      if (leaderElector == null && !dcCalled && !cc.isShutDown()) {
+        if (afterExpiration) {
+          throw new AlreadyClosedException();
+        }
         ContextKey contextKey = new ContextKey(collection, coreName);
         leaderElector = new LeaderElector(this, contextKey);
-        LeaderElector oldElector = leaderElectors.put(replica.getName(), leaderElector);
-        IOUtils.closeQuietly(oldElector);
-      }
+        LeaderElector oldElector = leaderElectors.putIfAbsent(replica.getName(), leaderElector);
 
-      //
-      try {
-        // If we're a preferred leader, insert ourselves at the head of the queue
-        boolean joinAtHead = replica.getBool(SliceMutator.PREFERRED_LEADER_PROP, false);
-        if (replica.getType() != Type.PULL) {
-          //getCollectionTerms(collection).register(cloudDesc.getShardId(), coreName);
-          // nocommit review
-          joinElection(desc, joinAtHead);
+        if (oldElector != null) {
+          // another caller won the race; close the extra elector and use the existing one
+          IOUtils.closeQuietly(leaderElector);
+          leaderElector = oldElector;
         }
-      } catch (InterruptedException e) {
-        ParWork.propagateInterrupt(e);
-        return null;
-      } catch (KeeperException | IOException e) {
-        throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
+      }
+
+      // If we're a preferred leader, insert ourselves at the head of the queue
+      boolean joinAtHead = replica.getBool(SliceMutator.PREFERRED_LEADER_PROP, false);
+      if (replica.getType() != Type.PULL) {
+        //getCollectionTerms(collection).register(cloudDesc.getShardId(), coreName);
+        // nocommit review
+        joinElection(desc, joinAtHead);
       }
 
       log.info("Wait to see leader for {}, {}", collection, shardId);
       Replica leader = null;
-      for (int i = 0; i < 30; i++) {
-//        if (leaderElector.isLeader()) {
-//          leader = replica;
-//          break;
-//        }
+      for (int i = 0; i < 15; i++) {
+        if (leaderElector.isLeader()) {
+          leader = replica;
+          break;
+        }
 
         try {
-          if (getCoreContainer().isShutDown() || isDcCalled() || isClosed()) {
-            throw new AlreadyClosedException();
-          }
+          //          if (getCoreContainer().isShutDown() || isDcCalled() || isClosed()) {
+          //            throw new AlreadyClosedException();
+          //          }
 
-          leader = zkStateReader.getLeaderRetry(collection, shardId, 500, false);
+          leader = zkStateReader.getLeaderRetry(collection, shardId, 3000, false);
 
         } catch (TimeoutException timeoutException) {
-
+          log.info("Timeout waiting to see leader, retry");
         }
       }
 
       if (leader == null) {
+        log.error("No leader found while trying to register " + coreName + " with zookeeper");
         throw new SolrException(ErrorCode.SERVER_ERROR, "No leader found while trying to register " + coreName + " with zookeeper");
       }
 
@@ -1502,7 +1532,6 @@ public class ZkController implements Closeable, Runnable {
           startReplicationFromLeader(coreName, false);
         }
 
-
         if (replica.getType() != Type.PULL && shardTerms != null) {
           // the watcher is added to a set so multiple calls of this method will leave only one watcher
           if (log.isDebugEnabled()) log.debug("add shard terms listener for {}", coreName);
@@ -1519,6 +1548,9 @@ public class ZkController implements Closeable, Runnable {
       desc.getCloudDescriptor().setHasRegistered(true);
 
       return shardId;
+    } catch (Exception e) {
+      log.error("Error registering SolrCore with Zookeeper", e);
+      throw new SolrException(ErrorCode.SERVER_ERROR, "Error registering SolrCore with Zookeeper", e);
     } finally {
       if (isDcCalled() || isClosed()) {
         IOUtils.closeQuietly(leaderElector);
@@ -1565,46 +1597,8 @@ public class ZkController implements Closeable, Runnable {
     log.info("{} stopping background replication from leader", coreName);
     ReplicateFromLeader replicateFromLeader = replicateFromLeaders.remove(coreName);
     if (replicateFromLeader != null) {
-      ParWork.close(replicateFromLeader);
-    }
-  }
-
-  // timeoutms is the timeout for the first call to get the leader - there is then
-  // a longer wait to make sure that leader matches our local state
-  private String getLeader(final CloudDescriptor cloudDesc, int timeoutms) {
-
-    String collection = cloudDesc.getCollectionName();
-    String shardId = cloudDesc.getShardId();
-    // rather than look in the cluster state file, we go straight to the zknodes
-    // here, because on cluster restart there could be stale leader info in the
-    // cluster state node that won't be updated for a moment
-    String leaderUrl;
-    try {
-      leaderUrl = getLeaderProps(collection, cloudDesc.getShardId(), timeoutms)
-              .getCoreUrl();
-
-      zkStateReader.waitForState(collection, timeoutms * 2, TimeUnit.MILLISECONDS, (n, c) -> checkLeaderUrl(cloudDesc, leaderUrl, collection, shardId, leaderConflictResolveWait));
-
-    } catch (Exception e) {
-      ParWork.propagateInterrupt(e);
-      throw new SolrException(ErrorCode.SERVER_ERROR, "Error getting leader from zk", e);
-    }
-    return leaderUrl;
-  }
-
-  private boolean checkLeaderUrl(CloudDescriptor cloudDesc, String leaderUrl, String collection, String shardId,
-                                 int timeoutms) {
-    // now wait until our currently cloud state contains the latest leader
-    String clusterStateLeaderUrl;
-    try {
-      clusterStateLeaderUrl = zkStateReader.getLeaderUrl(collection, shardId, 10000);
-
-      // leaderUrl = getLeaderProps(collection, cloudDesc.getShardId(), timeoutms).getCoreUrl();
-    } catch (Exception e) {
-      ParWork.propagateInterrupt(e);
-      throw new SolrException(ErrorCode.SERVER_ERROR, e);
+      IOUtils.closeQuietly(replicateFromLeader);
     }
-    return clusterStateLeaderUrl != null;
   }
 
   /**
@@ -1656,6 +1650,7 @@ public class ZkController implements Closeable, Runnable {
     Map<String, Object> props = new HashMap<>();
     // we only put a subset of props into the leader node
     props.put(ZkStateReader.NODE_NAME_PROP, getNodeName());
+    props.put(CORE_NAME_PROP, cd.getName());
 
     Replica replica = new Replica(cd.getName(), props, collection, shardId, zkStateReader);
     LeaderElector leaderElector;
@@ -1674,7 +1669,7 @@ public class ZkController implements Closeable, Runnable {
     }
 
     ElectionContext context = new ShardLeaderElectionContext(leaderElector, shardId,
-        collection, cd.getName(), replica, this, cc);
+        collection, cd.getName(), replica, this, cc, cd);
 
 
     leaderElector.setup(context);
@@ -1718,7 +1713,7 @@ public class ZkController implements Closeable, Runnable {
    * Publish core state to overseer.
    */
   public void publish(final CoreDescriptor cd, final Replica.State state, boolean updateLastState) throws Exception {
-    MDCLoggingContext.setCoreDescriptor(cc, cd);
+    MDCLoggingContext.setCoreName(cd.getName());
     try {
       log.info("publishing state={}", state);
       try (SolrCore core = cc.getCore(cd.getName())) {
@@ -1751,9 +1746,6 @@ public class ZkController implements Closeable, Runnable {
       props.put(ZkStateReader.COLLECTION_PROP, collection);
       props.put(ZkStateReader.REPLICA_TYPE, cd.getCloudDescriptor().getReplicaType().toString());
 
-      if (numShards != null) {
-        props.put(ZkStateReader.NUM_SHARDS_PROP, numShards.toString());
-      }
       try (SolrCore core = cc.getCore(cd.getName())) {
         if (core != null && core.getDirectoryFactory().isSharedStorage()) {
           // nocommit
@@ -1838,14 +1830,13 @@ public class ZkController implements Closeable, Runnable {
     collectionToTerms.values().forEach(ZkCollectionTerms::close);
   }
 
-  public void unregister(String coreName, CoreDescriptor cd) throws KeeperException, InterruptedException {
+  public void unregister(String coreName, String collection, String shardId) throws KeeperException, InterruptedException {
     log.info("Unregister core from zookeeper {}", coreName);
-    final String collection = cd.getCloudDescriptor().getCollectionName();
     try {
 
       ZkCollectionTerms ct = collectionToTerms.get(collection);
       if (ct != null) {
-        ct.remove(cd.getCloudDescriptor().getShardId(), cd);
+        ct.remove(shardId, coreName);
       }
 
       replicasMetTragicEvent.remove(collection + ":" + coreName);
@@ -2083,8 +2074,6 @@ public class ZkController implements Closeable, Runnable {
     String electionNode = params.get(ELECTION_NODE_PROP);
 
     try {
-      MDCLoggingContext.setCoreDescriptor(cc, cc.getCoreDescriptor(coreName));
-
       log.info("Rejoin the shard leader election.");
       LeaderElector elect =  leaderElectors.get(coreName);
       if (elect != null) {
@@ -2096,8 +2085,6 @@ public class ZkController implements Closeable, Runnable {
     } catch (Exception e) {
       ParWork.propagateInterrupt(e);
       throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to rejoin election", e);
-    } finally {
-      MDCLoggingContext.clear();
     }
   }
 
@@ -2404,7 +2391,7 @@ public class ZkController implements Closeable, Runnable {
   public OnReconnect getConfigDirListener() {
     return new OnReconnect() {
       @Override
-      public void command() throws SessionExpiredException {
+      public void command() {
           confDirectoryListeners.forEach((s, runnables) -> {
             setConfWatcher(s, new WatcherImpl(s), null);
             fireEventListeners(s);
@@ -2418,60 +2405,6 @@ public class ZkController implements Closeable, Runnable {
     };
   }
 
-  /** @lucene.internal */
-  public class UnloadCoreOnDeletedWatcher implements DocCollectionWatcher {
-    String shard;
-    String coreName;
-
-    public UnloadCoreOnDeletedWatcher(String shard, String coreName) {
-      this.shard = shard;
-      this.coreName = coreName;
-    }
-
-    @Override
-    // synchronized due to SOLR-11535
-    public synchronized boolean onStateChanged(DocCollection collectionState) {
-      if (isClosed()) { // don't accidentally delete cores on shutdown due to unreliable state
-        return true;
-      }
-
-      if (getCoreContainer().getCoreDescriptor(coreName) == null) return true;
-
-      boolean replicaRemoved = getReplicaOrNull(collectionState, shard, coreName) == null;
-      if (replicaRemoved) {
-        try {
-          log.info("Replica {} removed from clusterstate, remove it.", coreName);
-          getCoreContainer().unload(coreName, true, true, true); // nocommit - this causes bad things in tests
-        } catch (SolrException e) {
-          if (!e.getMessage().contains("Cannot unload non-existent core")) {
-            // no need to log if the core was already unloaded
-            log.warn("Failed to unregister core:{}", coreName, e);
-          }
-        } catch (Exception e) {
-          ParWork.propagateInterrupt(e);
-          log.warn("Failed to unregister core:{}", coreName, e);
-        }
-      }
-      return replicaRemoved;
-    }
-
-    @Override
-    public boolean equals(Object o) {
-      if (this == o) return true;
-      if (o == null || getClass() != o.getClass()) return false;
-      UnloadCoreOnDeletedWatcher that = (UnloadCoreOnDeletedWatcher) o;
-      return
-          Objects.equals(shard, that.shard) &&
-          Objects.equals(coreName, that.coreName);
-    }
-
-    @Override
-    public int hashCode() {
-
-      return Objects.hash(shard, coreName);
-    }
-  }
-
   /**
    * Thrown during pre register process if the replica is not present in clusterstate
    */
@@ -2498,7 +2431,7 @@ public class ZkController implements Closeable, Runnable {
    *
    * @param nodeName to operate on
    */
-  public void publishNodeAsDown(String nodeName) throws KeeperException {
+  public void publishNodeAs(String nodeName, OverseerAction state) throws KeeperException {
     log.info("Publish node={} as DOWN", nodeName);
 
     if (overseer == null) {
@@ -2506,7 +2439,7 @@ public class ZkController implements Closeable, Runnable {
       return;
     }
 
-    ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.DOWNNODE.toLower(),
+    ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, state.toLower(),
         ZkStateReader.NODE_NAME_PROP, nodeName);
     try {
       statePublisher.submitState(m);
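
The new ZkController.run() shutdown hook publishes the node's state and then blocks until every local replica shows DOWN in each collection. A sketch of the predicate it hands to zkStateReader.waitForState, over hypothetical stand-in types (the real predicate receives live nodes and a DocCollection):

    import java.util.List;

    // Hypothetical stand-in for the waitForState predicate in run() above:
    // true once no replica on this node reports anything other than "down".
    final class DownCheck {
      static final class ReplicaView {
        final String nodeName; final String state;
        ReplicaView(String nodeName, String state) { this.nodeName = nodeName; this.state = state; }
      }

      static boolean allLocalReplicasDown(List<ReplicaView> replicas, String nodeName) {
        if (replicas == null) return true; // a deleted collection counts as down
        for (ReplicaView r : replicas) {
          if (nodeName.equals(r.nodeName) && !"down".equals(r.state)) {
            return false;
          }
        }
        return true;
      }
    }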
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkDistributedQueue.java b/solr/core/src/java/org/apache/solr/cloud/ZkDistributedQueue.java
index 7fc8ef0..c3567b2 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkDistributedQueue.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkDistributedQueue.java
@@ -17,14 +17,12 @@
 package org.apache.solr.cloud;
 
 import org.apache.solr.client.solrj.cloud.DistributedQueue;
+import org.apache.solr.cloud.overseer.OverseerAction;
 import org.apache.solr.common.cloud.ConnectionManager.IsClosed;
 import org.apache.solr.common.cloud.SolrZkClient;
 import org.apache.zookeeper.CreateMode;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.Op;
-import org.apache.zookeeper.WatchedEvent;
-import org.apache.zookeeper.Watcher;
-import org.apache.zookeeper.data.Stat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -37,7 +35,6 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.concurrent.CountDownLatch;
 
 /**
  * <p>A ZK-based distributed queue. Optimized for single-consumer,
@@ -126,7 +123,8 @@ public class ZkDistributedQueue implements DistributedQueue {
   static {
     OPERATIONS.add("state");
     OPERATIONS.add("leader");
-    OPERATIONS.add("downnode");
+    OPERATIONS.add(OverseerAction.DOWNNODE.toLower());
+    OPERATIONS.add(OverseerAction.RECOVERYNODE.toLower());
     OPERATIONS.add("updateshardstate");
   }
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkShardTerms.java b/solr/core/src/java/org/apache/solr/cloud/ZkShardTerms.java
index d9eb9a3..debfd8e 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkShardTerms.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkShardTerms.java
@@ -24,7 +24,6 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicReference;
 
@@ -36,7 +35,6 @@ import org.apache.solr.common.cloud.SolrZkClient;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.util.ObjectReleaseTracker;
 import org.apache.solr.common.util.Utils;
-import org.apache.solr.core.CoreDescriptor;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.Watcher;
 import org.apache.zookeeper.data.Stat;
@@ -99,7 +97,7 @@ public class ZkShardTerms implements Closeable {
     void close();
   }
 
-  public ZkShardTerms(String collection, String shard, SolrZkClient zkClient) throws IOException{
+  public ZkShardTerms(String collection, String shard, SolrZkClient zkClient) throws IOException, KeeperException {
     this.znodePath = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/terms/" + shard;
     this.collection = collection;
     this.shard = shard;
@@ -187,13 +185,12 @@ public class ZkShardTerms implements Closeable {
    * Remove the coreNodeName from terms map and also remove any expired listeners
    * @return Return true if this object should not be reused
    */
-  boolean removeTerm(CoreDescriptor cd) throws KeeperException, InterruptedException {
+  boolean removeTermFor(String name) throws KeeperException, InterruptedException {
     int numListeners;
-      // solrcore already closed
     listeners.removeIf(coreTermWatcher -> !coreTermWatcher.onTermChanged(terms.get()));
     numListeners = listeners.size();
 
-    return removeTerm(cd.getName()) || numListeners == 0;
+    return removeTerm(name) || numListeners == 0;
   }
 
   // package private for testing, only used by tests
@@ -223,7 +220,7 @@ public class ZkShardTerms implements Closeable {
    */
   void registerTerm(String coreNodeName) throws KeeperException, InterruptedException {
     ShardTerms newTerms;
-    while ( (newTerms = terms.get().registerTerm(coreNodeName)) != null) {
+    while ((newTerms = terms.get().registerTerm(coreNodeName)) != null) {
       if (forceSaveTerms(newTerms)) break;
     }
   }
@@ -312,7 +309,7 @@ public class ZkShardTerms implements Closeable {
       return saveTerms(newTerms);
     } catch (KeeperException.NoNodeException e) {
       log.error("No node exists in ZK to save terms to", e);
-      return true;
+      throw new AlreadyClosedException();
     }
   }
 
@@ -324,18 +321,20 @@ public class ZkShardTerms implements Closeable {
    */
   private boolean saveTerms(ShardTerms newTerms) throws KeeperException, InterruptedException {
     byte[] znodeData = Utils.toJSON(newTerms);
+
     try {
       Stat stat = zkClient.setData(znodePath, znodeData, newTerms.getVersion(), true);
       ShardTerms newShardTerms = new ShardTerms(newTerms, stat.getVersion());
       setNewTerms(newShardTerms);
-      if (log.isDebugEnabled()) log.debug("Successful update of terms at {} to {}", znodePath, newTerms);
+      log.info("Successful update of terms at {} to {}", znodePath, newTerms);
       return true;
     } catch (KeeperException.BadVersionException e) {
-      log.info("Failed to save terms, version is not a match, retrying version={}", newTerms.getVersion());
-
-      if (isClosed.get()) {
-        throw new AlreadyClosedException();
+      int foundVersion = -1;
+      Stat stat = zkClient.exists(znodePath, null);
+      if (stat != null) {
+        foundVersion = stat.getVersion();
       }
+      log.info("Failed to save terms, version is not a match, retrying version={} found={}", newTerms.getVersion(), foundVersion);
 
       refreshTerms(false);
     }
@@ -347,16 +346,20 @@ public class ZkShardTerms implements Closeable {
    */
   public void refreshTerms(boolean setWatch) throws KeeperException {
     ShardTerms newTerms;
-    try {
-      Watcher watcher = event -> {
-        // session events are not change events, and do not remove the watcher
-        if (Watcher.Event.EventType.None == event.getType()) {
-          return;
-        }
-        if (event.getType() == Watcher.Event.EventType.NodeCreated || event.getType() == Watcher.Event.EventType.NodeDataChanged) {
+    Watcher watcher = event -> {
+      // session events are not change events, and do not remove the watcher
+      if (Watcher.Event.EventType.None == event.getType()) {
+        return;
+      }
+      if (event.getType() == Watcher.Event.EventType.NodeCreated || event.getType() == Watcher.Event.EventType.NodeDataChanged) {
+        try {
           retryRegisterWatcher();
+        } catch (KeeperException e) {
+          log.warn("Exception refreshing terms on watcher event", e);
         }
-      };
+      }
+    };
+    try {
       Stat stat = new Stat();
       byte[] data = zkClient.getData(znodePath, setWatch ? watcher : null, stat, true);
       ConcurrentHashMap<String,Long> values = new ConcurrentHashMap<>((Map<String,Long>) Utils.fromJSON(data));
@@ -364,8 +367,36 @@ public class ZkShardTerms implements Closeable {
       newTerms = new ShardTerms(values, stat.getVersion());
     } catch (KeeperException.NoNodeException e) {
       log.warn("No node found for shard terms", e);
+      if (!isClosed.get()) {
+        try {
+          if (zkClient.exists(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection)) {
+            try {
+              zkClient.mkdir(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/terms");
+            } catch (KeeperException.NodeExistsException e1) {
+
+            }
+            try {
+              zkClient.mkdir(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/terms/" + shard, ZkStateReader.emptyJson);
+            } catch (KeeperException.NodeExistsException e1) {
+
+            }
+            Stat stat = new Stat();
+            byte[] data = zkClient.getData(znodePath, setWatch ? watcher : null, stat, true);
+            ConcurrentHashMap<String,Long> values = new ConcurrentHashMap<>((Map<String,Long>) Utils.fromJSON(data));
+            if (log.isDebugEnabled()) log.debug("refresh shard terms to zk version {}", stat.getVersion());
+            // nocommit
+            log.info("refresh shard terms to zk version {}", stat.getVersion());
+            newTerms = new ShardTerms(values, stat.getVersion());
+            setNewTerms(newTerms);
+            return;
+          }
+        } catch (InterruptedException interruptedException) {
+          throw new AlreadyClosedException(interruptedException);
+        }
+      }
+
       // we have likely been deleted
-      return;
+      throw new AlreadyClosedException("No node found for shard terms");
     } catch (InterruptedException e) {
       ParWork.propagateInterrupt(e);
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error updating shard term for collection: " + collection, e);
@@ -377,26 +408,8 @@ public class ZkShardTerms implements Closeable {
   /**
    * Retry register a watcher to the correspond ZK term node
    */
-  private void retryRegisterWatcher() {
-    while (!isClosed.get()) {
-      try {
-        refreshTerms(true);
-        return;
-      } catch (KeeperException.AuthFailedException e) {
-        isClosed.set(true);
-        log.error("Failed watching shard term for collection: {} due to unrecoverable exception", collection, e);
-        return;
-      } catch (KeeperException e) {
-        log.warn("Failed watching shard term for collection: {}, retrying!", collection, e);
-        try {
-          zkClient.getConnectionManager().waitForConnected(zkClient.getZkClientTimeout());
-        } catch (TimeoutException | InterruptedException te) {
-          if (Thread.interrupted()) {
-            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error watching shard term for collection: " + collection, te);
-          }
-        }
-      }
-    }
+  private void retryRegisterWatcher() throws KeeperException {
+    refreshTerms(true);
   }
 
   /**
@@ -405,7 +418,10 @@ public class ZkShardTerms implements Closeable {
    */
   private void setNewTerms(ShardTerms newTerms) {
     boolean isChanged = false;
+    int cnt = 0;
     for (;;)  {
+      cnt++;
+      log.info("set new terms {} {}", newTerms, cnt);
       ShardTerms terms = this.terms.get();
       if (terms == null || newTerms.getVersion() > terms.getVersion())  {
         if (this.terms.compareAndSet(terms, newTerms))  {
@@ -421,6 +437,10 @@ public class ZkShardTerms implements Closeable {
   }
 
   private void onTermUpdates(ShardTerms newTerms) {
-    listeners.removeIf(coreTermWatcher -> !coreTermWatcher.onTermChanged(newTerms));
+    try {
+      listeners.removeIf(coreTermWatcher -> !coreTermWatcher.onTermChanged(newTerms));
+    } catch (Exception e) {
+      log.error("Error calling shard term listener", e);
+    }
   }
 }
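
saveTerms above now reports the found znode version on a BadVersionException and retries after refreshing. A sketch of that versioned compare-and-set against ZooKeeper, with an illustrative retry bound (the real code re-derives the payload from the refreshed terms before retrying):

    import org.apache.zookeeper.KeeperException;
    import org.apache.zookeeper.ZooKeeper;
    import org.apache.zookeeper.data.Stat;

    // Hypothetical versioned-write loop in the spirit of saveTerms(...):
    // setData succeeds only if our cached version still matches the znode.
    final class VersionedWrite {
      static int writeWithRetry(ZooKeeper zk, String path, byte[] data, int cachedVersion, int maxAttempts)
          throws KeeperException, InterruptedException {
        for (int attempt = 0; attempt < maxAttempts; attempt++) {
          try {
            Stat stat = zk.setData(path, data, cachedVersion);
            return stat.getVersion(); // committed; caller caches the new version
          } catch (KeeperException.BadVersionException e) {
            Stat stat = zk.exists(path, false);
            if (stat == null) {
              throw e; // znode is gone; surface the failure, as the hunk above does
            }
            cachedVersion = stat.getVersion(); // refresh the version and retry
          }
        }
        throw new KeeperException.BadVersionException(path);
      }
    }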
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkSolrResourceLoader.java b/solr/core/src/java/org/apache/solr/cloud/ZkSolrResourceLoader.java
index 0179abd..fab501e 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkSolrResourceLoader.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkSolrResourceLoader.java
@@ -68,7 +68,7 @@ public class ZkSolrResourceLoader extends SolrResourceLoader implements Resource
   public InputStream openResource(String resource) throws IOException {
 
     String file = (".".equals(resource)) ? configSetZkPath : configSetZkPath + "/" + resource;
-    if (log.isDebugEnabled()) log.debug("open resource {}", resource);
+    if (log.isTraceEnabled()) log.trace("open resource {}", resource);
 
     try {
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/CreateCollectionCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/CreateCollectionCmd.java
index 930ffac..7b6858e 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/CreateCollectionCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/CreateCollectionCmd.java
@@ -88,6 +88,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.NoSuchElementException;
 import java.util.Properties;
+import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 
@@ -150,9 +151,16 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
     final boolean waitForFinalState = false;
     final String alias = message.getStr(ALIAS, collectionName);
     if (log.isDebugEnabled()) log.debug("Create collection {}", collectionName);
-    if (clusterState.hasCollection(collectionName)) {
+    CountDownLatch latch = new CountDownLatch(1);
+    zkStateReader.getZkClient().getSolrZooKeeper().sync(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collectionName, (rc, path, ctx) -> {
+      latch.countDown();
+    }, null);
+    latch.await(5, TimeUnit.SECONDS);
+
+    if (zkStateReader.getZkClient().exists(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collectionName)) {
       throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "collection already exists: " + collectionName);
     }
+
     if (aliases.hasAlias(collectionName)) {
       throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "collection alias already exists: " + collectionName);
     }
@@ -255,7 +263,7 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
         ZkNodeProps props = new ZkNodeProps();
         //props.getProperties().putAll(message.getProperties());
         ZkNodeProps addReplicaProps = new ZkNodeProps(Overseer.QUEUE_OPERATION, ADDREPLICA.toString(), ZkStateReader.COLLECTION_PROP, collectionName, ZkStateReader.SHARD_ID_PROP,
-            replicaPosition.shard, ZkStateReader.CORE_NAME_PROP, coreName, ZkStateReader.STATE_PROP, Replica.State.DOWN.toString(), ZkStateReader.NODE_NAME_PROP, nodeName, "node", nodeName,
+            replicaPosition.shard, ZkStateReader.CORE_NAME_PROP, coreName, ZkStateReader.STATE_PROP, Replica.State.RECOVERING.toString(), ZkStateReader.NODE_NAME_PROP, nodeName, "node", nodeName,
             ZkStateReader.REPLICA_TYPE, replicaPosition.type.name(), ZkStateReader.NUM_SHARDS_PROP, message.getStr(ZkStateReader.NUM_SHARDS_PROP), "shards", message.getStr("shards"),
             CommonAdminParams.WAIT_FOR_FINAL_STATE, Boolean.toString(waitForFinalState));
         props.getProperties().putAll(addReplicaProps.getProperties());
@@ -300,7 +308,7 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
       ocmh.overseer.getZkStateWriter().enqueueUpdate(clusterState, null, false);
       ocmh.overseer.getZkStateWriter().writePendingUpdates();
 
-      if (log.isDebugEnabled()) log.debug("Sending create call for {} replicas", coresToCreate.size());
+      if (log.isDebugEnabled()) log.debug("Sending create call for {} replicas for {}", coresToCreate.size(), collectionName);
       for (Map.Entry<String,ShardRequest> e : coresToCreate.entrySet()) {
         ShardRequest sreq = e.getValue();
         if (log.isDebugEnabled()) log.debug("Submit request to shard for for replica coreName={} total requests={} shards={}", e.getKey(), coresToCreate.size(),
@@ -393,7 +401,7 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
                 }
                 for (Slice slice : slices) {
                   if (log.isTraceEnabled()) log.trace("slice {} leader={}", slice, slice.getLeader());
-                  if (slice.getLeader() == null || slice.getLeader().getState() != Replica.State.ACTIVE) {
+                  Replica sliceLeader = slice.getLeader();
+                  if (sliceLeader == null || sliceLeader.getState() != Replica.State.ACTIVE) {
                     if (log.isTraceEnabled()) log.trace("no leader found for slice {}", slice.getName());
                     return false;
                   }
@@ -402,10 +410,10 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
                 return true;
               });
             } catch (InterruptedException e) {
-              log.warn("Interrupted waiting for active replicas on collection creation {}", collectionName);
+              log.warn("Interrupted waiting for active replicas on collection creation collection={}", collectionName);
               throw new SolrException(ErrorCode.SERVER_ERROR, e);
             } catch (TimeoutException e) {
-              log.error("Exception waiting for active replicas on collection creation {}", collectionName);
+              log.error("Timeout waiting for active replicas on collection creation collection={}", collectionName);
               throw new SolrException(ErrorCode.SERVER_ERROR, e);
             }
           }
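
The wait loop above is the ZkStateReader.waitForState idiom: block until every shard of
the new collection reports an ACTIVE leader. A condensed sketch of the same predicate,
with the timeout value assumed:

    // Sketch only; mirrors the predicate in the hunk above.
    zkStateReader.waitForState(collectionName, 30, TimeUnit.SECONDS, (liveNodes, coll) -> {
      if (coll == null) return false;
      for (Slice slice : coll.getSlices()) {
        Replica leader = slice.getLeader();
        // every shard needs an ACTIVE leader before creation is considered done
        if (leader == null || leader.getState() != Replica.State.ACTIVE) return false;
      }
      return true;
    });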
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteCollectionCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteCollectionCmd.java
index cd6379f..29dfd01 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteCollectionCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteCollectionCmd.java
@@ -37,10 +37,6 @@ import org.apache.solr.core.snapshots.SolrSnapshotManager;
 import org.apache.solr.handler.admin.MetricsHistoryHandler;
 import org.apache.solr.handler.component.ShardHandler;
 import org.apache.solr.metrics.SolrMetricManager;
-import org.apache.zookeeper.KeeperException;
-import org.apache.zookeeper.WatchedEvent;
-import org.apache.zookeeper.Watcher;
-import org.apache.zookeeper.data.Stat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -106,12 +102,14 @@ public class DeleteCollectionCmd implements OverseerCollectionMessageHandler.Cmd
     }
 
     log.info("Check if collection exists in zookeeper {}", collection);
-
+    CountDownLatch latch = new CountDownLatch(1);
+    zkStateReader.getZkClient().getSolrZooKeeper().sync(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection, (rc, path, ctx) -> {
+      latch.countDown();
+    }, null);
+    latch.await(5, TimeUnit.SECONDS);
     if (!zkStateReader.getZkClient().exists(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection)) {
       throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Could not find collection " + collection);
     }
-
-
     checkNotColocatedWith(zkStateReader, collection);
 
     final boolean deleteHistory = message.getBool(CoreAdminParams.DELETE_METRICS_HISTORY, true);
@@ -179,6 +177,7 @@ public class DeleteCollectionCmd implements OverseerCollectionMessageHandler.Cmd
     response.asyncFinalRunner = new OverseerCollectionMessageHandler.Finalize() {
       @Override
       public AddReplicaCmd.Response call() {
+        results.add("collection", collection);
         if (finalShardHandler != null && finalShardRequestTracker != null) {
           try {
             finalShardRequestTracker.processResponses(results, finalShardHandler, false, null, okayExceptions);
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java
index 9614b17..cc92226 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java
@@ -278,8 +278,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
   @SuppressWarnings("unchecked")
   public OverseerSolrResponse processMessage(ZkNodeProps message, String operation, ZkStateWriter zkWriter) throws InterruptedException {
     MDCLoggingContext.setCollection(message.getStr(COLLECTION));
-    MDCLoggingContext.setShard(message.getStr(SHARD_ID_PROP));
-    MDCLoggingContext.setReplica(message.getStr(REPLICA_PROP));
+    MDCLoggingContext.setCoreName(message.getStr(REPLICA_PROP));
     if (log.isDebugEnabled()) log.debug("OverseerCollectionMessageHandler.processMessage : {} , {}", operation, message);
 
     ClusterState clusterState = zkWriter.getClusterstate(false);
diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/OverseerAction.java b/solr/core/src/java/org/apache/solr/cloud/overseer/OverseerAction.java
index b9016ab..abcd76c 100644
--- a/solr/core/src/java/org/apache/solr/cloud/overseer/OverseerAction.java
+++ b/solr/core/src/java/org/apache/solr/cloud/overseer/OverseerAction.java
@@ -34,14 +34,15 @@ public enum OverseerAction {
   UPDATESHARDSTATE,
   STATE,
   QUIT,
-  DOWNNODE;
+  DOWNNODE,
+  RECOVERYNODE;
 
   public static OverseerAction get(String p) {
     if (p != null) {
       try {
         return OverseerAction.valueOf(p.toUpperCase(Locale.ROOT));
       } catch (Exception ex) {
-        ParWork.propagateInterrupt(ex);
+        // unknown action name; fall through and return null
       }
     }
     return null;
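
Because get() returns null for an unknown name instead of throwing, callers can dispatch
on the result directly; a sketch of the dispatch style used by the ZkStateWriter change
later in this commit:

    OverseerAction action = OverseerAction.get(entry.getKey());
    if (action == OverseerAction.DOWNNODE) {
      // mark every replica hosted on the named node as DOWN
    } else if (action == OverseerAction.RECOVERYNODE) {
      // mark every replica hosted on the named node as RECOVERING
    } else {
      // not a node operation: the key is a core name carrying a state update
    }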
diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java
index 80013b4..25fdc23 100644
--- a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java
+++ b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java
@@ -17,6 +17,7 @@
 package org.apache.solr.cloud.overseer;
 
 import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -56,6 +57,7 @@ public class ZkStateWriter {
 
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
   private final ZkStateReader reader;
+  private final Overseer overseer;
 
   /**
    * Represents a no-op {@link ZkWriteCommand} which will result in no modification to cluster state
@@ -85,9 +87,9 @@ public class ZkStateWriter {
   private Set<String> dirtyStructure = new HashSet<>();
   private Set<String> dirtyState = new HashSet<>();
 
-  public ZkStateWriter(ZkStateReader zkStateReader, Stats stats) {
+  public ZkStateWriter(ZkStateReader zkStateReader, Stats stats, Overseer overseer) {
     assert zkStateReader != null;
-
+    this.overseer = overseer;
     this.reader = zkStateReader;
     this.stats = stats;
 
@@ -187,55 +189,10 @@ public class ZkStateWriter {
             message.getProperties().remove("operation");
 
             for (Map.Entry<String,Object> entry : message.getProperties().entrySet()) {
-              if (entry.getKey().equalsIgnoreCase("downnode")) {
-                log.info("set downnode for {}", entry.getValue());
-                cs.forEachCollection(docColl -> {
-
-                  if (trackVersions.get(docColl.getName()) == null) {
-                    reader.forciblyRefreshClusterStateSlow(docColl.getName());
-                    DocCollection latestColl = reader.getClusterState().getCollectionOrNull(docColl.getName());
-
-                    if (latestColl == null) {
-                      //log.info("no node exists, using version 0");
-                      trackVersions.remove(docColl.getName());
-                    } else {
-                      cs.getCollectionStates().put(latestColl.getName(), new ClusterState.CollectionRef(latestColl));
-                      //log.info("got version from zk {}", existsStat.getVersion());
-                      int version = latestColl.getZNodeVersion();
-                      log.info("Updating local tracked version to {} for {}", version, docColl.getName());
-                      trackVersions.put(docColl.getName(), version);
-                    }
-                  }
-
-                  ZkNodeProps updates = stateUpdates.get(docColl.getName());
-                  if (updates == null) {
-                    updates = new ZkNodeProps();
-                    stateUpdates.put(docColl.getName(), updates);
-                  }
-                  Integer ver = trackVersions.get(docColl.getName());
-                  if (ver == null) {
-                    //   ver = docColl.getZNodeVersion();
-                    if (ver == null) {
-                      ver = 0;
-                    } else {
-
-                    }
-                  }
-                  updates.getProperties().put("_cs_ver_", ver.toString());
-                  List<Replica> replicas = docColl.getReplicas();
-                  for (Replica replica : replicas) {
-                    if (replica.getState() != Replica.State.DOWN && replica.getNodeName().equals(entry.getValue())) {
-                      log.info("set downnode for replica {}", replica);
-                      // nocommit
-                      Slice slice = docColl.getSlice(replica.getSlice());
-                      slice.setLeader(null);
-                      replica.setState(Replica.State.DOWN);
-                      updates.getProperties().put(replica.getName(), Replica.State.getShortState(Replica.State.DOWN));
-                      updates.getProperties().remove("leader");
-                      dirtyState.add(docColl.getName());
-                    }
-                  }
-                });
+              OverseerAction action = OverseerAction.get(entry.getKey());
+              if (action == OverseerAction.DOWNNODE) {
+                nodeOperation(entry, Replica.State.DOWN);
+              } else if (action == OverseerAction.RECOVERYNODE) {
+                nodeOperation(entry, Replica.State.RECOVERING);
               } else {
                 String core = entry.getKey();
                 String collectionAndStateString = (String) entry.getValue();
@@ -297,6 +254,7 @@ public class ZkStateWriter {
                         docColl.getSlice(replica).setLeader(null);
                       }
                       updates.getProperties().put(replica.getName(), Replica.State.getShortState(state));
+                      updates.getProperties().remove("leader");
                       // log.info("set state {} {}", state, replica);
                       replica.setState(state);
                       dirtyState.add(collection);
@@ -340,6 +298,57 @@ public class ZkStateWriter {
     }
   }
 
+  private void nodeOperation(Map.Entry<String,Object> entry, Replica.State state) {
+    log.info("set {} for {}", state, entry.getValue());
+    cs.forEachCollection(docColl -> {
+
+      if (trackVersions.get(docColl.getName()) == null) {
+        reader.forciblyRefreshClusterStateSlow(docColl.getName());
+        DocCollection latestColl = reader.getClusterState().getCollectionOrNull(docColl.getName());
+
+        if (latestColl == null) {
+          //log.info("no node exists, using version 0");
+          trackVersions.remove(docColl.getName());
+        } else {
+          cs.getCollectionStates().put(latestColl.getName(), new ClusterState.CollectionRef(latestColl));
+          //log.info("got version from zk {}", existsStat.getVersion());
+          int version = latestColl.getZNodeVersion();
+          log.info("Updating local tracked version to {} for {}", version, docColl.getName());
+          trackVersions.put(docColl.getName(), version);
+        }
+      }
+
+      ZkNodeProps updates = stateUpdates.get(docColl.getName());
+      if (updates == null) {
+        updates = new ZkNodeProps();
+        stateUpdates.put(docColl.getName(), updates);
+      }
+      Integer ver = trackVersions.get(docColl.getName());
+      if (ver == null) {
+        ver = 0;
+      }
+      updates.getProperties().put("_cs_ver_", ver.toString());
+      List<Replica> replicas = docColl.getReplicas();
+      for (Replica replica : replicas) {
+        if (replica.getState() != state && replica.getNodeName().equals(entry.getValue())) {
+          if (log.isDebugEnabled()) log.debug("set {} for replica {}", state, replica);
+          // nocommit
+          Slice slice = docColl.getSlice(replica.getSlice());
+          slice.setLeader(null);
+          // the new state must match the requested operation (DOWN or RECOVERING)
+          replica.setState(state);
+          updates.getProperties().put(replica.getName(), Replica.State.getShortState(state));
+          updates.getProperties().remove("leader");
+          dirtyState.add(docColl.getName());
+        }
+      }
+    });
+  }
+
   public Integer lastWrittenVersion(String collection) {
     return trackVersions.get(collection);
   }
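
For context, the entries handled by nodeOperation come from a ZkNodeProps message whose
keys are action names (or core names) and whose values are node names. A sketch of the
producing side; the construction is assumed rather than shown in this diff, and
OverseerAction.toLower() is assumed to exist as in stock Solr:

    // Hypothetical producer of a node operation message.
    ZkNodeProps msg = new ZkNodeProps(OverseerAction.DOWNNODE.toLower(), "127.0.0.1:8983_solr");
    // enqueueUpdate then walks msg.getProperties(): node-operation keys fan out to every
    // replica on that node, any other key is treated as a core name with a state update.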
@@ -352,130 +361,160 @@ public class ZkStateWriter {
   // if additional updates too large, publish structure changew
   public void writePendingUpdates() {
 
-   // writeLock.lock();
-   // try {
-   //   log.info("Get our write lock");
-      ourLock.lock();
+    do {
       try {
-   //     log.info("Got our write lock");
+        write();
+        break;
+      } catch (KeeperException.BadVersionException e) {
+        // stale cached znode version; write() refreshed it from ZK, so loop and retry
-        throttle.minimumWaitBetweenActions();
-        throttle.markAttemptingAction();
+      } catch (Exception e) {
+        log.error("write pending failed", e);
+        break;
+      }
 
-        if (log.isTraceEnabled()) {
-          log.trace("writePendingUpdates {}", cs);
-        }
+    } while (!overseer.isClosed());
 
-        if (failedUpdates.size() > 0) {
-          log.warn("Some collection updates failed {} logging last exception", failedUpdates, lastFailedException); // nocommit expand
-          failedUpdates.clear();
-          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, lastFailedException.get());
-        }
+  }
+
+  private void write() throws KeeperException.BadVersionException {
+    // writeLock.lock();
+    // try {
+    //   log.info("Get our write lock");
+    ourLock.lock();
+    try {
+ //     log.info("Got our write lock");
+
+      throttle.minimumWaitBetweenActions();
+      throttle.markAttemptingAction();
+
+      if (log.isTraceEnabled()) {
+        log.trace("writePendingUpdates {}", cs);
+      }
+
+      if (failedUpdates.size() > 0) {
+        log.warn("Some collection updates failed {} logging last exception", failedUpdates, lastFailedException); // nocommit expand
+        failedUpdates.clear();
+        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, lastFailedException.get());
+      }
 //      } finally {
 //        ourLock.unlock();
 //      }
 
-      // wait to see our last publish version has propagated TODO don't wait on collections not hosted on overseer?
-      // waitForStateWePublishedToComeBack();
-
-   //   ourLock.lock();
-      AtomicInteger lastVersion = new AtomicInteger();
-      //log.info("writing out state, looking at collections count={} toWrite={} {} : {}", cs.getCollectionsMap().size(), collectionsToWrite.size(), cs.getCollectionsMap().keySet(), collectionsToWrite);
-      //try {
-        cs.forEachCollection(collection -> {
-         // log.info("check collection {}", collection);
-          if (dirtyStructure.contains(collection.getName()) || dirtyState.contains(collection.getName())) {
-          //  log.info("process collection {}", collection);
-            String name = collection.getName();
-            String path = ZkStateReader.getCollectionPath(collection.getName());
-            String pathSCN = ZkStateReader.getCollectionSCNPath(collection.getName());
-           // log.info("process collection {} path {}", collection.getName(), path);
-            Stat existsStat = null;
-            if (log.isTraceEnabled()) log.trace("process {}", collection);
+    // wait to see our last publish version has propagated TODO don't wait on collections not hosted on overseer?
+    // waitForStateWePublishedToComeBack();
+
+ //   ourLock.lock();
+    AtomicInteger lastVersion = new AtomicInteger();
+    AtomicReference<KeeperException.BadVersionException> badVersionException = new AtomicReference<>();
+    List<String> removeCollections = new ArrayList<>();
+    //log.info("writing out state, looking at collections count={} toWrite={} {} : {}", cs.getCollectionsMap().size(), collectionsToWrite.size(), cs.getCollectionsMap().keySet(), collectionsToWrite);
+    //try {
+      cs.forEachCollection(collection -> {
+       // log.info("check collection {}", collection);
+        Integer version = null;
+        if (dirtyStructure.contains(collection.getName()) || dirtyState.contains(collection.getName())) {
+        //  log.info("process collection {}", collection);
+          String name = collection.getName();
+          String path = ZkStateReader.getCollectionPath(collection.getName());
+          String pathSCN = ZkStateReader.getCollectionSCNPath(collection.getName());
+         // log.info("process collection {} path {}", collection.getName(), path);
+          Stat existsStat = null;
+          if (log.isTraceEnabled()) log.trace("process {}", collection);
+          try {
+           // log.info("get data for {}", name);
+            byte[] data = Utils.toJSON(singletonMap(name, collection));
+          //  log.info("got data for {} {}", name, data.length);
+
             try {
-             // log.info("get data for {}", name);
-              byte[] data = Utils.toJSON(singletonMap(name, collection));
-            //  log.info("got data for {} {}", name, data.length);
+              Integer v = trackVersions.get(collection.getName());
+
+              if (v != null) {
+                //log.info("got version from cache {}", v);
+                version = v;
+              } else {
+                version = 0;
+              }
+              lastVersion.set(version);
+              if (log.isDebugEnabled()) log.debug("Write state.json prevVersion={} bytes={} col={}", version, data.length, collection);
 
-              try {
-                Integer version = null;
-                Integer v = trackVersions.get(collection.getName());
+              reader.getZkClient().setData(path, data, version, true);
+              trackVersions.put(collection.getName(), version + 1);
+              if (dirtyStructure.contains(collection.getName())) {
+                if (log.isDebugEnabled()) log.debug("structure change in {}", collection.getName());
+                dirtyStructure.remove(collection.getName());
+                reader.getZkClient().setData(pathSCN, null, -1, true);
 
-                if (v != null) {
-                  //log.info("got version from cache {}", v);
-                  version = v;
-                } else {
-                  version = 0;
-                }
-                lastVersion.set(version);
-                if (log.isDebugEnabled()) log.debug("Write state.json prevVersion={} bytes={} col={}", version, data.length, collection);
-
-                reader.getZkClient().setData(path, data, version, true);
-                trackVersions.put(collection.getName(), version + 1);
-                if (dirtyStructure.contains(collection.getName())) {
-                  if (log.isDebugEnabled()) log.debug("structure change in {}", collection.getName());
-                  dirtyStructure.remove(collection.getName());
-                  reader.getZkClient().setData(pathSCN, null, -1, true);
-
-                  ZkNodeProps updates = stateUpdates.get(collection.getName());
-                  if (updates != null) {
-                    updates.getProperties().clear();
-                  }
+                ZkNodeProps updates = stateUpdates.get(collection.getName());
+                if (updates != null) {
+                  updates.getProperties().clear();
                 }
+              }
 
-              } catch (KeeperException.NoNodeException e) {
-                if (log.isDebugEnabled()) log.debug("No node found for state.json", e);
+            } catch (KeeperException.NoNodeException e) {
+              if (log.isDebugEnabled()) log.debug("No node found for state.json", e);
 
-                lastVersion.set(-1);
-              //  trackVersions.remove(collection.getName());
-                // likely deleted
-                return;
+              lastVersion.set(-1);
+            //  trackVersions.remove(collection.getName());
+              // likely deleted
 
-              } catch (KeeperException.BadVersionException bve) {
-                //lastFailedException.set(bve);
-                //failedUpdates.put(collection.getName(), collection);
-               // Stat estate = reader.getZkClient().exists(path, null);
-                trackVersions.remove(collection.getName());
-                throw bve;
+            } catch (KeeperException.BadVersionException bve) {
+              //lastFailedException.set(bve);
+              //failedUpdates.put(collection.getName(), collection);
+             // Stat estate = reader.getZkClient().exists(path, null);
+              trackVersions.remove(collection.getName());
+              Stat stat = reader.getZkClient().exists(path, null);
+              log.error("Tried to update state.json ({}) with bad version {} \n {}", collection, version, stat != null ? stat.getVersion() : "null");
 
+              if (!overseer.isClosed() && stat != null) {
+                trackVersions.put(collection.getName(), stat.getVersion());
+              } else {
+                removeCollections.add(collection.getName());
               }
 
-              if (dirtyState.contains(collection.getName())) {
-                ZkNodeProps updates = stateUpdates.get(collection.getName());
-                if (updates != null) {
-                  String stateUpdatesPath = ZkStateReader.getCollectionStateUpdatesPath(collection.getName());
-                  if (log.isDebugEnabled()) log.debug("write state updates for collection {} {}", collection.getName(), updates);
-                  dirtyState.remove(collection.getName());
-                  reader.getZkClient().setData(stateUpdatesPath, Utils.toJSON(updates), -1, true);
-                }
+              throw bve;
+            }
+
+            if (dirtyState.contains(collection.getName())) {
+              ZkNodeProps updates = stateUpdates.get(collection.getName());
+              if (updates != null) {
+                String stateUpdatesPath = ZkStateReader.getCollectionStateUpdatesPath(collection.getName());
+                if (log.isDebugEnabled()) log.debug("write state updates for collection {} {}", collection.getName(), updates);
+                dirtyState.remove(collection.getName());
+                reader.getZkClient().setData(stateUpdatesPath, Utils.toJSON(updates), -1, true);
               }
+            }
 
-            } catch (InterruptedException | AlreadyClosedException e) {
-              log.info("We have been closed or one of our resources has, bailing {}", e.getClass().getSimpleName() + ":" + e.getMessage());
+          } catch (KeeperException.BadVersionException bve) {
+            badVersionException.set(bve);
+          } catch (InterruptedException | AlreadyClosedException e) {
+            log.info("We have been closed or one of our resources has, bailing {}", e.getClass().getSimpleName() + ":" + e.getMessage());
 
-            } catch (Exception e) {
-              log.error("Failed processing update=" + collection, e);
-            }
+          } catch (Exception e) {
+            log.error("Failed processing update=" + collection, e);
           }
+        }
 
-        });
+      });
 
+      removeCollections.forEach(c -> removeCollection(c));
 
+      if (badVersionException.get() != null) {
+        throw badVersionException.get();
+      }
 
-        //log.info("Done with successful cluster write out");
+      //log.info("Done with successful cluster write out");
 
-      } finally {
-        ourLock.unlock();
-      }
-//    } finally {
-//      writeLock.unlock();
-//    }
+    } finally {
+      ourLock.unlock();
+    }
+    //    } finally {
+    //      writeLock.unlock();
+    //    }
     // nocommit - harden against failures and exceptions
 
     //    if (log.isDebugEnabled()) {
     //      log.debug("writePendingUpdates() - end - New Cluster State is: {}", newClusterState);
     //    }
-
   }
 
   private void waitForStateWePublishedToComeBack() {
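
The writePendingUpdates/write split above is a compare-and-set loop on the state.json
znode version: on BadVersionException the cached version is refreshed from ZooKeeper and
the write is retried until it lands or the overseer closes. A generic sketch of the same
pattern, using only the setData/exists calls visible in this diff:

    // Sketch: retry a versioned setData, refreshing the cached version on conflict.
    do {
      try {
        zkClient.setData(path, data, cachedVersion, true);
        break;
      } catch (KeeperException.BadVersionException e) {
        Stat stat = zkClient.exists(path, null);   // re-read the winning version
        cachedVersion = stat == null ? -1 : stat.getVersion();
      }
    } while (!overseer.isClosed());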
diff --git a/solr/core/src/java/org/apache/solr/core/CachingDirectoryFactory.java b/solr/core/src/java/org/apache/solr/core/CachingDirectoryFactory.java
index 28c8028..5a1ce50 100644
--- a/solr/core/src/java/org/apache/solr/core/CachingDirectoryFactory.java
+++ b/solr/core/src/java/org/apache/solr/core/CachingDirectoryFactory.java
@@ -126,9 +126,8 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
 
   @Override
   public void addCloseListener(Directory dir, CloseListener closeListener) {
-    if (log.isDebugEnabled()) {
-      log.debug("addCloseListener(Directory dir={}, CloseListener closeListener={}) - start", dir, closeListener);
-    }
+    if (log.isTraceEnabled()) log.trace("addCloseListener(Directory dir={}, CloseListener closeListener={}) - start", dir, closeListener);
+
 
     synchronized (this) {
       if (!byDirectoryCache.containsKey(dir)) {
@@ -145,16 +144,12 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
       closeListeners.put(dir, listeners);
     }
 
-    if (log.isDebugEnabled()) {
-      log.debug("addCloseListener(Directory, CloseListener) - end");
-    }
+    if (log.isTraceEnabled()) log.trace("addCloseListener(Directory, CloseListener) - end");
   }
 
   @Override
   public void doneWithDirectory(Directory directory) throws IOException {
-    if (log.isDebugEnabled()) {
-      log.debug("doneWithDirectory(Directory directory={}) - start", directory);
-    }
+    if (log.isTraceEnabled()) log.trace("doneWithDirectory(Directory directory={}) - start", directory);
 
     synchronized (this) {
       CacheValue cacheValue = byDirectoryCache.get(directory);
@@ -172,9 +167,8 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
       }
     }
 
-    if (log.isDebugEnabled()) {
-      log.debug("doneWithDirectory(Directory) - end");
-    }
+
+    if (log.isTraceEnabled()) log.trace("doneWithDirectory(Directory) - end");
   }
 
   /*
@@ -184,9 +178,8 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
    */
   @Override
   public void close() throws IOException {
-    if (log.isDebugEnabled()) {
-      log.debug("close() - start");
-    }
+    if (log.isTraceEnabled()) log.trace("close() - start");
+
 
     synchronized (this) {
       closed = true;
@@ -227,31 +220,23 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
       }
     }
 
-    if (log.isDebugEnabled()) {
-      log.debug("close() - end");
-    }
+    if (log.isTraceEnabled()) log.trace("close() - end");
   }
 
   private synchronized void removeFromCache(CacheValue v) {
-    if (log.isDebugEnabled()) {
-      log.debug("removeFromCache(CacheValue v={}) - start", v);
-    }
+    if (log.isTraceEnabled()) log.trace("removeFromCache(CacheValue v={}) - start", v);
 
     if (log.isDebugEnabled()) log.debug("Removing from cache: {}", v);
     byDirectoryCache.remove(v.directory);
     byPathCache.remove(v.path);
 
-    if (log.isDebugEnabled()) {
-      log.debug("removeFromCache(CacheValue) - end");
-    }
+    if (log.isTraceEnabled()) log.trace("removeFromCache(CacheValue) - end");
   }
 
   // be sure this is called with the this sync lock
   // returns true if we closed the cacheValue, false if it will be closed later
   private boolean closeCacheValue(CacheValue cacheValue) {
-    if (log.isDebugEnabled()) {
-      log.debug("closeCacheValue(CacheValue cacheValue={}) - start", cacheValue);
-    }
+    if (log.isTraceEnabled()) log.trace("closeCacheValue(CacheValue cacheValue={}) - start", cacheValue);
 
     if (log.isDebugEnabled()) log.debug("looking to close {} {}", cacheValue.path, cacheValue.closeEntries.toString());
     List<CloseListener> listeners = closeListeners.remove(cacheValue.directory);
@@ -330,16 +315,12 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
       }
     }
 
-    if (log.isDebugEnabled()) {
-      log.debug("closeCacheValue(CacheValue) - end");
-    }
+    if (log.isTraceEnabled()) log.trace("closeCacheValue(CacheValue) - end");
     return cl;
   }
 
   private void close(CacheValue val) {
-    if (log.isDebugEnabled()) {
-      log.debug("close(CacheValue val={}) - start", val);
-    }
+    if (log.isTraceEnabled()) log.trace("close(CacheValue val={}) - start", val);
 
     if (log.isDebugEnabled()) log.debug("Closing directory, CoreContainer#isShutdown={}", coreContainer != null ? coreContainer.isShutDown() : "null");
     try {
@@ -357,38 +338,32 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
       ParWork.propagateInterrupt("Error closing directory", e);
     }
 
-    if (log.isDebugEnabled()) {
-      log.debug("close(CacheValue) - end");
-    }
+    if (log.isTraceEnabled()) log.trace("close(CacheValue) - end");
   }
 
   private boolean isSubPath(CacheValue cacheValue, CacheValue otherCacheValue) {
-    if (log.isDebugEnabled()) {
-      log.debug("isSubPath(CacheValue cacheValue={}, CacheValue otherCacheValue={}) - start", cacheValue, otherCacheValue);
-    }
+    if (log.isTraceEnabled()) log.trace("isSubPath(CacheValue cacheValue={}, CacheValue otherCacheValue={}) - start", cacheValue, otherCacheValue);
 
     int one = cacheValue.path.lastIndexOf('/');
     int two = otherCacheValue.path.lastIndexOf('/');
 
     boolean returnboolean = otherCacheValue.path.startsWith(cacheValue.path + "/") && two > one;
-    if (log.isDebugEnabled()) {
-      log.debug("isSubPath(CacheValue, CacheValue) - end");
-    }
+
+    if (log.isTraceEnabled()) log.trace("isSubPath(CacheValue, CacheValue) - end");
+
     return returnboolean;
   }
 
   @Override
   public boolean exists(String path) throws IOException {
-    if (log.isDebugEnabled()) {
-      log.debug("exists(String path={}) - start", path);
-    }
+    if (log.isTraceEnabled()) log.trace("exists(String path={}) - start", path);
 
     // back compat behavior
     File dirFile = new File(path);
     boolean returnboolean = dirFile.canRead() && dirFile.list().length > 0;
-    if (log.isDebugEnabled()) {
-      log.debug("exists(String) - end");
-    }
+
+    if (log.isTraceEnabled()) log.trace("exists(String) - end");
+
     return returnboolean;
   }
 
@@ -401,9 +376,7 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
   @Override
   public final Directory get(String path, DirContext dirContext, String rawLockType)
           throws IOException {
-    if (log.isDebugEnabled()) {
-      log.debug("get(String path={}, DirContext dirContext={}, String rawLockType={}) - start", path, dirContext, rawLockType);
-    }
+    if (log.isTraceEnabled()) log.trace("get(String path={}, DirContext dirContext={}, String rawLockType={}) - start", path, dirContext, rawLockType);
 
     if (closed) {
       throw new AlreadyClosedException();
@@ -443,9 +416,8 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
       //    log.info("getDir " + path, new RuntimeException("track get " + fullPath)); // nocommit
       // }
 
-      if (log.isDebugEnabled()) {
-        log.debug("get(String, DirContext, String) - end");
-      }
+      if (log.isTraceEnabled()) log.trace("get(String, DirContext, String) - end");
+
       return directory;
     }
   }
@@ -459,9 +431,7 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
    */
   @Override
   public void incRef(Directory directory) {
-    if (log.isDebugEnabled()) {
-      log.debug("incRef(Directory directory={}) - start", directory);
-    }
+    if (log.isTraceEnabled()) log.trace("incRef(Directory directory={}) - start", directory);
 
     synchronized (this) {
       CacheValue cacheValue = byDirectoryCache.get(directory);
@@ -473,16 +443,12 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
       log.debug("incRef'ed: {}", cacheValue,  DEBUG_GET_RELEASE && cacheValue.path.equals("data/index") ? new RuntimeException() : null);
     }
 
-    if (log.isDebugEnabled()) {
-      log.debug("incRef(Directory) - end");
-    }
+    if (log.isTraceEnabled()) log.trace("incRef(Directory) - end");
   }
 
   @Override
   public void init(NamedList args) {
-    if (log.isDebugEnabled()) {
-      log.debug("init(NamedList args={}) - start", args);
-    }
+    if (log.isTraceEnabled()) log.trace("init(NamedList args={}) - start", args);
 
     maxWriteMBPerSecFlush = (Double) args.get("maxWriteMBPerSecFlush");
     maxWriteMBPerSecMerge = (Double) args.get("maxWriteMBPerSecMerge");
@@ -497,9 +463,7 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
       log.info(SolrXmlConfig.SOLR_DATA_HOME + "=" + dataHomePath);
     }
 
-    if (log.isDebugEnabled()) {
-      log.debug("init(NamedList) - end");
-    }
+    if (log.isTraceEnabled()) log.trace("init(NamedList) - end");
   }
 
   /*
@@ -511,9 +475,7 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
    */
   @Override
   public void release(Directory directory) throws IOException {
-    if (log.isDebugEnabled()) {
-      log.debug("release(Directory directory={}) - start", directory);
-    }
+    if (log.isTraceEnabled()) log.trace("release(Directory directory={}) - start", directory);
 
     if (directory == null) {
       throw new NullPointerException();
@@ -540,7 +502,7 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
       //    }
       cacheValue.refCnt--;
 
-      if (cacheValue.refCnt == 0 && cacheValue.doneWithDir ||  closed) {
+      if ((cacheValue.refCnt == 0 && cacheValue.doneWithDir) || closed) {
         boolean cl = closeCacheValue(cacheValue);
         if (cl) {
           removeFromCache(cacheValue);
@@ -548,42 +510,28 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
       }
     }
 
-    if (log.isDebugEnabled()) {
-      log.debug("release(Directory) - end");
-    }
+    if (log.isTraceEnabled()) log.trace("release(Directory) - end");
   }
 
   @Override
   public void remove(String path) throws IOException {
-    if (log.isDebugEnabled()) {
-      log.debug("remove(String path={}) - start", path);
-    }
+    if (log.isTraceEnabled()) log.trace("remove(String path={}) - start", path);
 
     remove(path, false);
 
-    if (log.isDebugEnabled()) {
-      log.debug("remove(String) - end");
-    }
+    if (log.isTraceEnabled()) log.trace("remove(String) - end");
   }
 
   @Override
   public void remove(Directory dir) throws IOException {
-    if (log.isDebugEnabled()) {
-      log.debug("remove(Directory dir={}) - start", dir);
-    }
+    if (log.isTraceEnabled()) log.trace("remove(Directory dir={}) - start", dir);
 
     remove(dir, false);
-
-    if (log.isDebugEnabled()) {
-      log.debug("remove(Directory) - end");
-    }
   }
 
   @Override
   public void remove(String path, boolean deleteAfterCoreClose) throws IOException {
-    if (log.isDebugEnabled()) {
-      log.debug("remove(String path={}, boolean deleteAfterCoreClose={}) - start", path, deleteAfterCoreClose);
-    }
+    if (log.isTraceEnabled()) log.trace("remove(String path={}, boolean deleteAfterCoreClose={}) - start", path, deleteAfterCoreClose);
 
     synchronized (this) {
       CacheValue val = byPathCache.get(normalize(path));
@@ -592,17 +540,12 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
       }
       val.setDeleteOnClose(true, deleteAfterCoreClose);
     }
-
-    if (log.isDebugEnabled()) {
-      log.debug("remove(String, boolean) - end");
-    }
   }
 
   @Override
   public void remove(Directory dir, boolean deleteAfterCoreClose) throws IOException {
-    if (log.isDebugEnabled()) {
-      log.debug("remove(Directory dir={}, boolean deleteAfterCoreClose={}) - start", dir, deleteAfterCoreClose);
-    }
+    if (log.isTraceEnabled()) log.trace("remove(Directory dir={}, boolean deleteAfterCoreClose={}) - start", dir, deleteAfterCoreClose);
+
 
     synchronized (this) {
       CacheValue val = byDirectoryCache.get(dir);
@@ -611,10 +554,6 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
       }
       val.setDeleteOnClose(true, deleteAfterCoreClose);
     }
-
-    if (log.isDebugEnabled()) {
-      log.debug("remove(Directory, boolean) - end");
-    }
   }
 
   protected void removeDirectory(CacheValue cacheValue) throws IOException {
@@ -623,30 +562,21 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
 
   @Override
   public String normalize(String path) throws IOException {
-    if (log.isDebugEnabled()) {
-      log.debug("normalize(String path={}) - start", path);
-    }
+    if (log.isTraceEnabled()) log.trace("normalize(String path={}) - start", path);
+
 
     path = stripTrailingSlash(path);
 
-    if (log.isDebugEnabled()) {
-      log.debug("normalize(String) - end");
-    }
     return path;
   }
 
   protected String stripTrailingSlash(String path) {
-    if (log.isDebugEnabled()) {
-      log.debug("stripTrailingSlash(String path={}) - start", path);
-    }
+    if (log.isTraceEnabled()) log.trace("stripTrailingSlash(String path={}) - start", path);
 
     if (path.endsWith("/")) {
       path = path.substring(0, path.length() - 1);
     }
 
-    if (log.isDebugEnabled()) {
-      log.debug("stripTrailingSlash(String) - end");
-    }
     return path;
   }
 
@@ -657,9 +587,7 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
    * @see #doneWithDirectory
    */
   public synchronized Set<String> getLivePaths() {
-    if (log.isDebugEnabled()) {
-      log.debug("getLivePaths() - start");
-    }
+    if (log.isTraceEnabled()) log.trace("getLivePaths() - start");
 
     HashSet<String> livePaths = new HashSet<>(byPathCache.size());
     for (CacheValue val : byPathCache.values()) {
@@ -668,17 +596,14 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
       }
     }
 
-    if (log.isDebugEnabled()) {
-      log.debug("getLivePaths() - end");
-    }
+    if (log.isTraceEnabled()) log.trace("getLivePaths() - end");
+
     return livePaths;
   }
 
   @Override
   protected boolean deleteOldIndexDirectory(String oldDirPath) throws IOException {
-    if (log.isDebugEnabled()) {
-      log.debug("deleteOldIndexDirectory(String oldDirPath={}) - start", oldDirPath);
-    }
+    if (log.isTraceEnabled()) log.trace("deleteOldIndexDirectory(String oldDirPath={}) - start", oldDirPath);
 
     Set<String> livePaths = getLivePaths();
     if (livePaths.contains(oldDirPath)) {
@@ -690,13 +615,8 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
   }
 
   protected synchronized String getPath(Directory directory) {
-    if (log.isDebugEnabled()) {
-      log.debug("getPath(Directory directory={}) - start", directory);
-    }
+    if (log.isTraceEnabled()) log.trace("getPath(Directory directory={}) - start", directory);
 
-    if (log.isDebugEnabled()) {
-      log.debug("getPath(Directory) - end");
-    }
     return byDirectoryCache.get(directory).path;
   }
 }
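
The mechanical change through this file moves method-entry/exit logging from debug to
trace and collapses each guard to one line. With slf4j the {} placeholders already defer
string formatting, so the isTraceEnabled() guard mainly avoids varargs allocation on hot
paths; the idiom used throughout is:

    if (log.isTraceEnabled()) log.trace("get(String path={}) - start", path);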
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index ab33eae..0bcbe4b 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -25,7 +25,6 @@ import org.apache.http.config.Lookup;
 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.store.Directory;
 import org.apache.solr.client.solrj.SolrClient;
 import org.apache.solr.client.solrj.cloud.SolrCloudManager;
 import org.apache.solr.client.solrj.impl.CloudHttp2SolrClient;
@@ -40,7 +39,6 @@ import org.apache.solr.cloud.ZkController;
 import org.apache.solr.cloud.overseer.OverseerAction;
 import org.apache.solr.common.AlreadyClosedException;
 import org.apache.solr.common.ParWork;
-import org.apache.solr.common.PerThreadExecService;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.cloud.DocCollection;
@@ -55,13 +53,12 @@ import org.apache.solr.common.util.IOUtils;
 import org.apache.solr.common.util.ObjectCache;
 import org.apache.solr.common.util.ObjectReleaseTracker;
 import org.apache.solr.common.util.OrderedExecutor;
+import org.apache.solr.common.util.SysStats;
 import org.apache.solr.common.util.Utils;
-import org.apache.solr.core.DirectoryFactory.DirContext;
 import org.apache.solr.core.backup.repository.BackupRepository;
 import org.apache.solr.core.backup.repository.BackupRepositoryFactory;
 import org.apache.solr.filestore.PackageStoreAPI;
 import org.apache.solr.handler.RequestHandlerBase;
-import org.apache.solr.handler.SnapShooter;
 import org.apache.solr.handler.admin.CollectionsHandler;
 import org.apache.solr.handler.admin.ConfigSetsHandler;
 import org.apache.solr.handler.admin.CoreAdminHandler;
@@ -102,7 +99,6 @@ import org.apache.solr.util.SystemIdResolver;
 import org.apache.zookeeper.KeeperException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.slf4j.MDC;
 
 import static java.util.Objects.requireNonNull;
 import static org.apache.solr.common.params.CommonParams.AUTHC_PATH;
@@ -126,12 +122,10 @@ import java.nio.file.NoSuchFileException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.security.spec.InvalidKeySpecException;
-import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Comparator;
-import java.util.Date;
 import java.util.HashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
@@ -146,6 +140,7 @@ import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Future;
 import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.RejectedExecutionException;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.concurrent.locks.ReentrantLock;
@@ -158,7 +153,6 @@ public class CoreContainer implements Closeable {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
   final SolrCores solrCores = new SolrCores(this);
-  private final boolean isZkAware;
   private volatile boolean startedLoadingCores;
   private volatile boolean loaded;
 
@@ -195,9 +189,7 @@ public class CoreContainer implements Closeable {
 
   private volatile UpdateShardHandler updateShardHandler;
 
-  public volatile ExecutorService solrCoreLoadExecutor;
-
-  public volatile ExecutorService solrCoreCloseExecutor;
+  public volatile ExecutorService solrCoreExecutor;
 
   private final OrderedExecutor replayUpdatesExecutor;
 
@@ -347,19 +339,18 @@ public class CoreContainer implements Closeable {
     assert ObjectReleaseTracker.track(this);
     assert (closeTracker = new CloseTracker()) != null;
     this.containerProperties = new Properties(config.getSolrProperties());
-    String zkHost = System.getProperty("zkHost");
-    if (!StringUtils.isEmpty(zkHost)) {
-      zkSys = new ZkContainer(zkClient);
-      isZkAware = true;
-    } else {
-      isZkAware = false;
-    }
 
     this.loader = config.getSolrResourceLoader();
 
     this.solrHome = config.getSolrHome();
     this.cfg = requireNonNull(config);
 
+    if (zkClient != null) {
+      zkSys = new ZkContainer(zkClient);
+      zkSys.initZooKeeper(this, cfg.getCloudConfig());
+      MDCLoggingContext.setNode(zkSys.getZkController().getNodeName());
+    }
+
     if (null != this.cfg.getBooleanQueryMaxClauseCount()) {
       IndexSearcher.setMaxClauseCount(this.cfg.getBooleanQueryMaxClauseCount());
     }
@@ -403,18 +394,13 @@ public class CoreContainer implements Closeable {
         }
       });
     }
-    if (zkClient != null) {
-      zkSys.initZooKeeper(this, cfg.getCloudConfig());
-    }
+
     coreConfigService = ConfigSetService.createConfigSetService(cfg, loader, zkSys == null ? null : zkSys.zkController);
 
     containerProperties.putAll(cfg.getSolrProperties());
 
-    solrCoreLoadExecutor = new PerThreadExecService(ParWork.getRootSharedExecutor(), Math.max(16, Runtime.getRuntime().availableProcessors()),
-        false, false);
-
-    solrCoreCloseExecutor = new PerThreadExecService(ParWork.getRootSharedExecutor(), Math.max(16, Runtime.getRuntime().availableProcessors()),
-        false, false);
+    solrCoreExecutor = ParWork.getParExecutorService("Core",
+        4, Math.max(6, SysStats.PROC_COUNT * 2), 1000, new LinkedBlockingQueue<>(1024));
   }
 
   @SuppressWarnings({"unchecked"})
@@ -606,7 +592,6 @@ public class CoreContainer implements Closeable {
     cfg = null;
     containerProperties = null;
     replayUpdatesExecutor = null;
-    isZkAware = false;
   }
 
 
@@ -681,6 +666,10 @@ public class CoreContainer implements Closeable {
    * Load the cores defined for this CoreContainer
    */
   public void load() {
+    if (isZooKeeperAware()) {
+      MDCLoggingContext.setNode(zkSys.getZkController().getNodeName());
+    }
+
     long start = System.nanoTime();
     if (log.isDebugEnabled()) {
       log.debug("Loading cores into CoreContainer [instanceDir={}]", getSolrHome());
@@ -776,7 +765,6 @@ public class CoreContainer implements Closeable {
         }
 
         work.collect("", () -> {
-          MDCLoggingContext.setNode(this);
           securityConfHandler = isZooKeeperAware() ? new SecurityConfHandlerZk(this) : new SecurityConfHandlerLocal(this);
           securityConfHandler.initializeMetrics(solrMetricsContext, AUTHZ_PATH);
           containerHandlers.put(AUTHC_PATH, securityConfHandler);
@@ -881,26 +869,6 @@ public class CoreContainer implements Closeable {
     status |= CORE_DISCOVERY_COMPLETE;
     startedLoadingCores = true;
     for (final CoreDescriptor cd : cds) {
-//      if (isZooKeeperAware()) {
-//        String collection = cd.getCollectionName();
-//        try {
-//          zkSys.zkController.zkStateReader.waitForState(collection, 5, TimeUnit.SECONDS, (n, c) -> {
-//            if (c != null) {
-//              Replica replica = c.getReplica(cd.getName());
-//
-//              if (replica.getState().equals(State.DOWN)) {
-//                return true;
-//              }
-//
-//            }
-//            return false;
-//          });
-//        } catch (InterruptedException e) {
-//          ParWork.propagateInterrupt(e);
-//        } catch (TimeoutException e) {
-//          log.error("Timeout", e);
-//        }
-//      }
 
       if (log.isDebugEnabled()) log.debug("Process core descriptor {} {} {}", cd.getName(), cd.isTransient(), cd.isLoadOnStartup());
       if (cd.isTransient() || !cd.isLoadOnStartup()) {
@@ -908,28 +876,40 @@ public class CoreContainer implements Closeable {
       } else {
         solrCores.markCoreAsLoading(cd);
       }
+
+      if (isZooKeeperAware()) {
+        String collection = cd.getCollectionName();
+
+        if (!zkSys.zkController.getClusterState().hasCollection(collection)) {
+          try {
+            coresLocator.delete(this, cd);
+          } catch (Exception e) {
+            log.error("Exception deleting core.properties file", e);
+          }
+
+          unload(cd, cd.getName(), true, true, true);
+
+          continue;
+        }
+      }
+
       if (cd.isLoadOnStartup()) {
 
-        coreLoadFutures.add(solrCoreLoadExecutor.submit(() -> {
-          SolrCore core;
-          MDCLoggingContext.setCoreDescriptor(this, cd);
+        coreLoadFutures.add(solrCoreExecutor.submit(() -> {
+          SolrCore core = null;
+          MDCLoggingContext.setCoreName(cd.getName());
           try {
             try {
 
               core = createFromDescriptor(cd, false);
 
-              if (core.getDirectoryFactory().isSharedStorage()) {
-                if (isZooKeeperAware()) {
-                  zkSys.getZkController().throwErrorIfReplicaReplaced(cd);
-                }
-              }
-
             } finally {
               solrCores.markCoreAsNotLoading(cd);
             }
-            if (isZooKeeperAware()) {
-              new ZkController.RegisterCoreAsync(zkSys.zkController, cd, false).call();
-            }
+
+          } catch (Exception e) {
+            log.error("Error creating and registering core {}", cd.getName(), e);
+            throw e;
           } finally {
             MDCLoggingContext.clear();
           }
@@ -937,7 +917,9 @@ public class CoreContainer implements Closeable {
         }));
       }
     }
+
     if (isZooKeeperAware()) {
+      // TODO: should make sure we wait till no one is active before this, but would have to be before core load
       zkSys.getZkController().createEphemeralLiveNode();
     }
 
@@ -1087,10 +1069,6 @@ public class CoreContainer implements Closeable {
         replayUpdatesExecutor.shutdownAndAwaitTermination();
       });
 
-      if (solrCoreLoadExecutor != null) {
-        solrCoreLoadExecutor.shutdown();
-      }
-
       List<Callable<?>> callables = new ArrayList<>();
 
       if (metricManager != null) {
@@ -1158,29 +1136,21 @@ public class CoreContainer implements Closeable {
       closer.collect(callables);
       closer.collect(metricsHistoryHandler);
 
-
-      closer.collect(solrCoreLoadExecutor);
-
-
       closer.collect("WaitForSolrCores", solrCores);
 
-
       closer.addCollect();
 
       closer.collect(shardHandlerFactory);
       closer.collect(updateShardHandler);
 
-
-      closer.collect(solrCoreCloseExecutor);
       closer.collect(solrClientCache);
 
       closer.collect(loader);
 
       closer.collect();
 
+      closer.collect(solrCoreExecutor);
       closer.collect(zkSys);
-
-
     }
     log.info("CoreContainer closed");
     assert ObjectReleaseTracker.release(this);
@@ -1288,12 +1258,11 @@ public class CoreContainer implements Closeable {
     SolrCore core = null;
     CoreDescriptor cd = new CoreDescriptor(coreName, instancePath, parameters, getContainerProperties(), getZkController());
 
-    // nocommit
-//    if (getAllCoreNames().contains(coreName)) {
-//      log.warn("Creating a core with existing name is not allowed");
-//      // TODO: Shouldn't this be a BAD_REQUEST?
-//      throw new SolrException(ErrorCode.SERVER_ERROR, "Core with name '" + coreName + "' already exists.");
-//    }
+    if (getAllCoreNames().contains(coreName) || solrCores.isCoreLoading(coreName)) {
+      log.warn("Creating a core with existing name is not allowed {}", coreName);
+
+      throw new SolrException(ErrorCode.SERVER_ERROR, "Core with name '" + coreName + "' already exists.");
+    }
 
     boolean preExisitingZkEntry = false;
     try {
@@ -1326,7 +1295,7 @@ public class CoreContainer implements Closeable {
       coresLocator.delete(this, cd);
       if (isZooKeeperAware() && !preExisitingZkEntry) {
         try {
-          getZkController().unregister(coreName, cd);
+          getZkController().unregister(coreName, cd.getCollectionName(), cd.getCloudDescriptor().getShardId());
         } catch (Exception e) {
           log.error("", e);
         }
@@ -1392,7 +1361,7 @@ public class CoreContainer implements Closeable {
     SolrCore old = null;
     boolean registered = false;
     try {
-      MDCLoggingContext.setCoreDescriptor(this, dcore);
+      MDCLoggingContext.setCoreName(dcore.getName());
       SolrIdentifierValidator.validateCoreName(dcore.getName());
 
       ConfigSet coreConfig = coreConfigService.loadConfigSet(dcore);
@@ -1409,9 +1378,7 @@ public class CoreContainer implements Closeable {
             throw new AlreadyClosedException("Solr has been shutdown.");
           }
           solrCores.markCoreAsLoading(dcore);
-          if (isZooKeeperAware()) {
-            ParWork.getRootSharedExecutor().submit(new ZkController.RegisterCoreAsync(zkSys.zkController, dcore, false));
-          }
+
           core = new SolrCore(this, dcore, coreConfig);
         } catch (Exception e) {
           core = processCoreCreateException(e, dcore, coreConfig);
@@ -1421,6 +1388,17 @@ public class CoreContainer implements Closeable {
 
         old = registerCore(dcore, core, true);
         registered = true;
+        solrCores.markCoreAsNotLoading(dcore);
+
+        if (isZooKeeperAware()) {
+          if (!newCollection) {
+            if (core.getDirectoryFactory().isSharedStorage()) {
+              zkSys.getZkController().throwErrorIfReplicaReplaced(dcore);
+            }
+          }
+          ParWork.getRootSharedExecutor().submit(new ZkController.RegisterCoreAsync(zkSys.zkController, dcore, false));
+        }
+
       } catch (Exception e) {
 
         throw new SolrException(ErrorCode.SERVER_ERROR, e);
@@ -1462,24 +1440,34 @@ public class CoreContainer implements Closeable {
             if (core != null) {
 
               SolrCore finalCore1 = core;
-              solrCoreCloseExecutor.submit(() -> {
+              try {
+                solrCoreExecutor.submit(() -> {
+                  finalCore1.closeAndWait();
+                });
+              } catch (RejectedExecutionException e) {
                 finalCore1.closeAndWait();
-              });
+              }
               SolrCore finalOld = old;
-              solrCoreCloseExecutor.submit(() -> {
-                if (finalOld != null) {
-                  finalOld.closeAndWait();
-                }
-              });
+              try {
+                solrCoreExecutor.submit(() -> {
+                  if (finalOld != null) {
+                    finalOld.closeAndWait();
+                  }
+                });
+              } catch (RejectedExecutionException e) {
+                if (finalOld != null) {
+                  finalOld.closeAndWait();
+                }
+              }
             }
           }
           if (isShutDown) {
             SolrCore finalCore1 = core;
-            ParWork.getRootSharedExecutor().submit(() -> {
-
+            try {
+              solrCoreExecutor.submit(() -> {
+                finalCore1.closeAndWait();
+              });
+            } catch (RejectedExecutionException e) {
               finalCore1.closeAndWait();
-
-            });
+            }
           }
         }
       } finally {
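
The submit-then-run-inline fallback repeated in this hunk keeps core close work from
being dropped when the executor has already been shut down. A minimal helper sketch,
assuming only java.util.concurrent types; the helper is illustrative, not part of this
commit:

    // Run on the executor when it accepts work, inline on the caller thread otherwise.
    static void submitOrRun(ExecutorService exec, Runnable task) {
      try {
        exec.submit(task);
      } catch (RejectedExecutionException e) {
        task.run();
      }
    }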
@@ -1547,10 +1535,31 @@ public class CoreContainer implements Closeable {
                 .getLeader();
             if (leader != null && leader.getState() == State.ACTIVE) {
               log.info("Found active leader, will attempt to create fresh core and recover.");
-              resetIndexDirectory(dcore, coreConfig);
+
+              SolrConfig config = coreConfig.getSolrConfig();
+
+              String registryName = SolrMetricManager.getRegistryName(SolrInfoBean.Group.core, dcore.getName());
+              DirectoryFactory df = DirectoryFactory.loadDirectoryFactory(config, this, registryName);
+              String dataDir = SolrCore.findDataDir(df, null, config, dcore);
+              df.close();
+
+              try {
+                while (new File(dataDir).exists()) {
+                  try {
+                    Files.walk(new File(dataDir).toPath()).sorted(Comparator.reverseOrder()).map(Path::toFile).forEach(File::delete);
+                  } catch (NoSuchFileException e) {
+                    // a file vanished mid-walk; the while loop retries until the dir is gone
+                  }
+                }
+              } catch (Exception e) {
+                SolrException.log(log, "Failed to delete data dir for core:" + dcore.getName() + " dir:" + dataDir);
+              }
+
+              SolrCore core = new SolrCore(this, dcore, coreConfig);
+              core.getUpdateHandler().getUpdateLog().deleteAll();
+
               // the index of this core is emptied, its term should be set to 0
               getZkController().getShardTerms(desc.getCollectionName(), desc.getShardId()).setTermToZero(dcore.getName());
-              return new SolrCore(this, dcore, coreConfig);
+              return core;
             }
           } catch (Exception se) {
             se.addSuppressed(original);
@@ -1584,35 +1593,6 @@ public class CoreContainer implements Closeable {
   }
 
   /**
-   * Write a new index directory for the a SolrCore, but do so without loading it.
-   */
-  private void resetIndexDirectory(CoreDescriptor dcore, ConfigSet coreConfig) {
-    SolrConfig config = coreConfig.getSolrConfig();
-
-    String registryName = SolrMetricManager.getRegistryName(SolrInfoBean.Group.core, dcore.getName());
-    DirectoryFactory df = DirectoryFactory.loadDirectoryFactory(config, this, registryName);
-    String dataDir = SolrCore.findDataDir(df, null, config, dcore);
-
-    String tmpIdxDirName = "index." + new SimpleDateFormat(SnapShooter.DATE_FMT, Locale.ROOT).format(new Date());
-    SolrCore.modifyIndexProps(df, dataDir, config, tmpIdxDirName);
-
-    // Free the directory object that we had to create for this
-    Directory dir = null;
-    try {
-      dir = df.get(dataDir, DirContext.META_DATA, config.indexConfig.lockType);
-    } catch (IOException e) {
-      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
-    } finally {
-      try {
-        df.doneWithDirectory(dir);
-        df.release(dir);
-      } catch (IOException e) {
-        SolrException.log(log, e);
-      }
-    }
-  }
-
-  /**
    * @return a Collection of registered SolrCores
    */
   public Collection<SolrCore> getCores() {
@@ -1824,7 +1804,7 @@ public class CoreContainer implements Closeable {
           if (!success) {
             log.error("Failed reloading core, cleaning up new core");
             SolrCore finalNewCore = newCore;
-            solrCoreCloseExecutor.submit(() -> {
+            solrCoreExecutor.submit(() -> {
               //            try {
               if (finalNewCore != null) {
                 log.error("Closing failed new core");
@@ -1868,6 +1848,11 @@ public class CoreContainer implements Closeable {
     unload(name, false, false, false);
   }
 
+
+  public void unload(String name, boolean deleteIndexDir, boolean deleteDataDir, boolean deleteInstanceDir) {
+    unload(null, name, deleteIndexDir, deleteDataDir, deleteInstanceDir);
+  }
+
   /**
    * Unload a core from this container, optionally removing the core's data and configuration
    *
@@ -1876,58 +1861,67 @@ public class CoreContainer implements Closeable {
    * @param deleteDataDir     if true, delete the core's data directory on close
    * @param deleteInstanceDir if true, delete the core's instance directory on close
    */
-  public void unload(String name, boolean deleteIndexDir, boolean deleteDataDir, boolean deleteInstanceDir) {
+  public void unload(CoreDescriptor cd, String name, boolean deleteIndexDir, boolean deleteDataDir, boolean deleteInstanceDir) {
     log.info("Unload SolrCore {} deleteIndexDir={} deleteDataDir={} deleteInstanceDir={}", name, deleteIndexDir, deleteDataDir, deleteInstanceDir);
-    CoreDescriptor cd = solrCores.getCoreDescriptor(name);
+    if (cd == null) {
+      cd = solrCores.getCoreDescriptor(name);
+    }
     SolrException exception = null;
     try {
       if (name != null) {
+        CoreLoadFailure loadFailure = coreInitFailures.remove(name);
+        if (loadFailure != null) {
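+          // the core never loaded successfully; unregister it from ZK and clean up using only the descriptor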
 
-        if (isZooKeeperAware()) {
-          getZkController().stopReplicationFromLeader(name);
-
-          if (cd != null) {
-            try {
-              zkSys.getZkController().unregister(name, cd);
-            } catch (AlreadyClosedException e) {
+          if (isZooKeeperAware()) {
+            if (cd != null) {
+              try {
+                zkSys.getZkController().unregister(name, cd.getCollectionName(), cd.getCloudDescriptor().getShardId());
+              } catch (AlreadyClosedException e) {
+                // ZkController already shut down; nothing left to unregister
-            } catch (Exception e) {
-              log.error("Error unregistering core [" + name + "] from cloud state", e);
-              exception = new SolrException(ErrorCode.SERVER_ERROR, "Error unregistering core [" + name + "] from cloud state", e);
+              } catch (Exception e) {
+                log.error("Error unregistering core [" + name + "] from cloud state", e);
+                exception = new SolrException(ErrorCode.SERVER_ERROR, "Error unregistering core [" + name + "] from cloud state", e);
+              }
             }
           }
-        }
-        CoreLoadFailure loadFailure = coreInitFailures.remove(name);
-        if (loadFailure != null) {
+
           // getting the index directory requires opening a DirectoryFactory with a SolrConfig, etc,
           // which we may not be able to do because of the init error.  So we just go with what we
           // can glean from the CoreDescriptor - datadir and instancedir
           try {
             SolrCore.deleteUnloadedCore(loadFailure.cd, deleteDataDir, deleteInstanceDir);
             // If last time around we didn't successfully load, make sure that all traces of the coreDescriptor are gone.
+            solrCores.remove(name);
             if (cd != null) {
-              solrCores.remove(cd.getName());
               coresLocator.delete(this, cd);
             }
           } catch (Exception e) {
-            SolrException.log(log, "Failed try to unload failed core:" + cd.getName() + " dir:" + cd.getInstanceDir());
+            SolrException.log(log, "Failed try to unload failed core:" + name + " dir:" + (cd == null ? "null cd" : cd.getInstanceDir()));
           }
           return;
         }
       }
 
-      SolrCore core = null;
+      SolrCore core;
 
       core = solrCores.remove(name);
       if (core != null) {
+        if (cd == null) {
+          cd = core.getCoreDescriptor();
+        }
         try {
           core.getSolrCoreState().cancelRecovery(false, true);
         } catch (Exception e) {
           SolrException.log(log, "Failed canceling recovery for core:" + cd.getName() + " dir:" + cd.getInstanceDir());
         }
-      }
-      if (cd == null) {
-        throw new SolrException(ErrorCode.BAD_REQUEST, "Cannot unload non-existent core [" + name + "]");
+      } else {
+        SolrException ex = new SolrException(ErrorCode.BAD_REQUEST, "Cannot unload non-existent core [" + name + "]");
+        if (isZooKeeperAware()) {
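+          // in cloud mode another node may have unloaded this core already; treat a missing core as a no-op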
+          log.warn("SolrCore does not exist", ex);
+          return;
+        } else {
+          throw ex;
+        }
       }
 
       // delete metrics specific to this core
@@ -1937,16 +1931,40 @@ public class CoreContainer implements Closeable {
 
       if (core != null) {
         core.unloadOnClose(deleteIndexDir, deleteDataDir);
+      } else {
+        try {
+          SolrCore.deleteUnloadedCore(cd, deleteDataDir, deleteInstanceDir);
+          solrCores.remove(name);
+          if (cd != null) {
+            coresLocator.delete(this, cd);
+          }
+        } catch (Exception e) {
+          SolrException.log(log, "Failed trying to deleteUnloadedCore:" + name + " dir:" + (cd == null ? "null cd" : cd.getInstanceDir()));
+        }
       }
 
       if (core != null) {
         try {
-         core.closeAndWait();
+          core.closeAndWait();
         } catch (Exception e) {
           SolrException.log(log, "Failed closing or waiting for closed core:" + cd.getName() + " dir:" + cd.getInstanceDir());
         }
       }
 
+      if (isZooKeeperAware()) {
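+        // with the core closed, stop pulling from the leader and drop the replica from cluster state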
+        getZkController().stopReplicationFromLeader(name);
+
+        try {
+          zkSys.getZkController().unregister(name, cd.getCollectionName(), cd.getCloudDescriptor().getShardId());
+        } catch (AlreadyClosedException e) {
+          // already shutting down; ignore
+        } catch (Exception e) {
+          log.error("Error unregistering core [" + name + "] from cloud state", e);
+          exception = new SolrException(ErrorCode.SERVER_ERROR, "Error unregistering core [" + name + "] from cloud state", e);
+        }
+      }
+
       if (exception != null) {
         throw exception;
       }
@@ -2048,17 +2066,17 @@ public class CoreContainer implements Closeable {
     // waitAddPendingCoreOps to createFromDescriptor would introduce a race condition.
 
     // todo: ensure only transient?
-    if (core == null && desc != null) {
-      // nocommit - this does not seem right - should stop a core from loading on startup, before zk reg, not from getCore ...
-      //      if (isZooKeeperAware()) {
-      //        zkSys.getZkController().throwErrorIfReplicaReplaced(desc);
-      //      }
-
-      // nocommit: this can recreate a core when it's not transient - no good!
-      if (desc.isTransient() || !desc.isLoadOnStartup()) {
-        core = createFromDescriptor(desc, false); // This should throw an error if it fails.
-      }
-    }
+//    if (core == null && desc != null) {
+//      // nocommit - this does not seem right - should stop a core from loading on startup, before zk reg, not from getCore ...
+//      //      if (isZooKeeperAware()) {
+//      //        zkSys.getZkController().throwErrorIfReplicaReplaced(desc);
+//      //      }
+//
+//      // nocommit: this can recreate a core when it's not transient - no good!
+//      if (desc.isTransient() || !desc.isLoadOnStartup()) {
+//        core = createFromDescriptor(desc, false); // This should throw an error if it fails.
+//      }
+//    }
     return core;
   }
 
@@ -2168,7 +2186,7 @@ public class CoreContainer implements Closeable {
   }
 
   public boolean isZooKeeperAware() {
-    return isZkAware && zkSys != null && zkSys.zkController != null;
+    return zkSys != null && zkSys.zkController != null;
   }
 
   public ZkController getZkController() {
diff --git a/solr/core/src/java/org/apache/solr/core/CorePropertiesLocator.java b/solr/core/src/java/org/apache/solr/core/CorePropertiesLocator.java
index 4e4e2f8..3f9d586 100644
--- a/solr/core/src/java/org/apache/solr/core/CorePropertiesLocator.java
+++ b/solr/core/src/java/org/apache/solr/core/CorePropertiesLocator.java
@@ -171,8 +171,8 @@ public class CorePropertiesLocator implements CoresLocator {
       log.info("Found {} core definitions underneath {}", cds.size(), rootDirectory);
     }
     if (cds.size() > 0) {
-      if (log.isInfoEnabled()) {
-        log.info("Cores are: {}", cds.stream().map(CoreDescriptor::getName).collect(Collectors.toList()));
+      if (log.isDebugEnabled()) {
+        log.debug("Cores are: {}", cds.stream().map(CoreDescriptor::getName).collect(Collectors.toList()));
       }
     }
     return cds;
diff --git a/solr/core/src/java/org/apache/solr/core/SolrCore.java b/solr/core/src/java/org/apache/solr/core/SolrCore.java
index 29bc5bc..b70c03a 100644
--- a/solr/core/src/java/org/apache/solr/core/SolrCore.java
+++ b/solr/core/src/java/org/apache/solr/core/SolrCore.java
@@ -173,6 +173,7 @@ import java.util.concurrent.Callable;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
@@ -362,7 +363,7 @@ public final class SolrCore implements SolrInfoBean, Closeable {
       return;
     }
 
-    log.info("Set latest schema for core={} schema={}", getName(), replacementSchema);
+    if (log.isDebugEnabled()) log.debug("Set latest schema for core={} schema={}", getName(), replacementSchema);
 
     this.schema = replacementSchema;
 
@@ -900,8 +901,7 @@ public final class SolrCore implements SolrInfoBean, Closeable {
       log.debug("{}Solr index directory '{}' doesn't exist. Creating new index...", logid, indexDir);
 
       try (SolrIndexWriter writer = SolrIndexWriter.buildIndexWriter(this, "SolrCore.initIndex", indexDir, getDirectoryFactory(),
-              true, getLatestSchema(), solrConfig.indexConfig, solrDelPolicy, codec)) {
-        writer.commit();
+              true, getLatestSchema(), solrConfig.indexConfig, solrDelPolicy, codec, true)) {
       } catch (Exception e) {
         ParWork.propagateInterrupt(e);
         throw new SolrException(ErrorCode.SERVER_ERROR, e);
@@ -1228,15 +1228,15 @@ public final class SolrCore implements SolrInfoBean, Closeable {
       searcherReadyLatch.countDown();
 
       // nocommit - wait before publish active
-//      if (!getSolrConfig().useColdSearcher) {
-//        try {
-//          initSearcherFuture[0].get();
-//        } catch (InterruptedException e) {
-//          log.error("", e);
-//        } catch (ExecutionException e) {
-//          log.error("", e);
-//        }
-//      }
+      if (!getSolrConfig().useColdSearcher) {
+        try {
+          initSearcherFuture[0].get();
+        } catch (InterruptedException e) {
+          log.error("", e);
+        } catch (ExecutionException e) {
+          log.error("", e);
+        }
+      }
     }
 
 
@@ -1295,7 +1295,7 @@ public final class SolrCore implements SolrInfoBean, Closeable {
     }
     Future[] waitSearcher = new Future[1];
     try {
-      getSearcher(false, false, null, true);
+      getSearcher(false, false, waitSearcher, true);
     } finally {
       newReaderCreator = null;
       if (iwRef != null) {
@@ -1643,7 +1643,6 @@ public final class SolrCore implements SolrInfoBean, Closeable {
 //    }
 
     if (log.isDebugEnabled()) log.debug("open refcount {} {}", this, cnt);
-    MDCLoggingContext.setCore(this);
   }
 
   /**
diff --git a/solr/core/src/java/org/apache/solr/core/SolrCores.java b/solr/core/src/java/org/apache/solr/core/SolrCores.java
index 839d464..96b2566 100644
--- a/solr/core/src/java/org/apache/solr/core/SolrCores.java
+++ b/solr/core/src/java/org/apache/solr/core/SolrCores.java
@@ -115,8 +115,8 @@ class SolrCores implements Closeable {
     }
 
     cores.forEach((s, solrCore) -> {
-      container.solrCoreCloseExecutor.submit(() -> {
-        MDCLoggingContext.setCore(solrCore);
+      container.solrCoreExecutor.submit(() -> {
+        MDCLoggingContext.setCoreName(solrCore.getName());
         try {
           solrCore.closeAndWait();
         } catch (Throwable e) {
@@ -213,6 +213,7 @@ class SolrCores implements Closeable {
       set.addAll(getTransientCacheHandler().getAllCoreNames());
     }
     set.addAll(residentDesciptors.keySet());
+    set.addAll(currentlyLoadingCores);
 
     return set;
   }
@@ -266,6 +267,10 @@ class SolrCores implements Closeable {
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Cannot unload non-existent core [null]");
     }
 
+    if (!closed) {
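+      // let any in-flight load of this core finish so we remove a fully constructed instance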
+      waitForLoadingCoreToFinish(name, 5000);
+    }
+
     if (log.isDebugEnabled()) log.debug("remove core from solrcores {}", name);
     currentlyLoadingCores.remove(name);
     SolrCore ret = cores.remove(name);
@@ -281,7 +286,9 @@ class SolrCores implements Closeable {
 
   /* If you don't increment the reference count, someone could close the core before you use it. */
   SolrCore getCoreFromAnyList(String name) {
-    waitForLoadingCoreToFinish(name, 15000);
+    if (!closed) {
+      waitForLoadingCoreToFinish(name, 5000);
+    }
     CoreDescriptor cd = residentDesciptors.get(name);
 
     SolrCore core = cores.get(name);
@@ -337,7 +344,9 @@ class SolrCores implements Closeable {
   public CoreDescriptor getCoreDescriptor(String coreName) {
     if (coreName == null) return null;
 
-    waitForLoadingCoreToFinish(coreName, 15000);
+    if (!closed) {
+      waitForLoadingCoreToFinish(coreName, 5000);
+    }
 
     CoreDescriptor cd = residentDesciptors.get(coreName);
     if (cd != null) {
@@ -387,7 +396,7 @@ class SolrCores implements Closeable {
       while (!currentlyLoadingCores.isEmpty()) {
         synchronized (loadingSignal) {
           try {
-            loadingSignal.wait(1000);
+            loadingSignal.wait(500);
           } catch (InterruptedException e) {
             return;
           }
diff --git a/solr/core/src/java/org/apache/solr/core/ZkContainer.java b/solr/core/src/java/org/apache/solr/core/ZkContainer.java
index 08916cb..60be50b 100644
--- a/solr/core/src/java/org/apache/solr/core/ZkContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/ZkContainer.java
@@ -202,7 +202,7 @@ public class ZkContainer implements Closeable {
     log.info("Register in ZooKeeper core={} liveNodes={}", core.getName(), zkController.getZkStateReader().getLiveNodes());
     CoreDescriptor cd = core.getCoreDescriptor(); // save this here - the core may not have it later
     Runnable r = () -> {
-        MDCLoggingContext.setCore(core);
+        MDCLoggingContext.setCoreName(core.getName());
         try {
           try {
             if (testing_beforeRegisterInZk != null) {
diff --git a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
index 6ac1139..4b49a57 100644
--- a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
+++ b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
@@ -18,6 +18,7 @@ package org.apache.solr.handler;
 
 import com.google.common.base.Strings;
 import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.IndexCommit;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.SegmentInfos;
@@ -44,10 +45,8 @@ import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.ModifiableSolrParams;
-import org.apache.solr.common.util.ExecutorUtil;
 import org.apache.solr.common.util.FastInputStream;
 import org.apache.solr.common.util.NamedList;
-import org.apache.solr.common.util.SolrNamedThreadFactory;
 import org.apache.solr.common.util.SuppressForbidden;
 import org.apache.solr.core.DirectoryFactory;
 import org.apache.solr.core.DirectoryFactory.DirContext;
@@ -522,7 +521,7 @@ public class IndexFetcher {
       }
 
       // Create the sync service
-      fsyncService = ExecutorUtil.newMDCAwareSingleThreadExecutor(new SolrNamedThreadFactory("fsyncService"));
+      fsyncService = ParWork.getExecutorService(4);
       // use a synchronized list because the list is read by other threads (to show details)
       filesDownloaded = Collections.synchronizedList(new ArrayList<Map<String, Object>>());
       // if the generation of master is older than that of the slave , it means they are not compatible to be copied
@@ -726,7 +725,7 @@ public class IndexFetcher {
     ZkController zkController = solrCore.getCoreContainer().getZkController();
     CloudDescriptor cd = solrCore.getCoreDescriptor().getCloudDescriptor();
     Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry(
-        cd.getCollectionName(), cd.getShardId(), 1500, false);
+        cd.getCollectionName(), cd.getShardId(), 3000, false);
     return leaderReplica;
   }
 
@@ -812,7 +811,6 @@ public class IndexFetcher {
    * terminate the fsync service and wait for all the tasks to complete. If it is already terminated
    */
   private void terminateAndWaitFsyncService() throws Exception {
-    if (fsyncServiceFuture == null || fsyncService.isTerminated()) return;
     fsyncService.shutdown();
     // give a long wait say 1 hr
     fsyncService.awaitTermination(3600, TimeUnit.SECONDS);
@@ -1058,7 +1056,7 @@ public class IndexFetcher {
       log.warn("WARNING: clearing disk space ahead of time to avoid running out of space, could cause problems with current SolrCore approxTotalSpaceReqd{}, usableSpace={}", atsr, usableSpace);
       deleteFilesInAdvance(indexDir, indexDirPath, totalSpaceRequired, usableSpace);
     }
-    log.info("Files to download {}", filesToDownload);
+    if (log.isDebugEnabled()) log.debug("Files to download {}", filesToDownload);
     try {
       // nocommit
       try (ParWork parWork = new ParWork(this, true)) {
@@ -1116,7 +1114,7 @@ public class IndexFetcher {
               if (stop) {
                 throw new AlreadyClosedException();
               }
-              log.info("Downloaded {}", tmpIndexDir, file.get(NAME));
+              if (log.isDebugEnabled()) log.debug("Downloaded {}", tmpIndexDir, file.get(NAME));
               filesDownloaded.add(Collections.unmodifiableMap(file));
             } else {
               if (log.isDebugEnabled()) {
@@ -1234,8 +1232,13 @@ public class IndexFetcher {
           try {
             indexFileChecksum = CodecUtil.retrieveChecksum(indexInput);
             compareResult.checkSummed = true;
+          } catch (CorruptIndexException e) {
+            // a corrupt checksum footer means the local copy cannot be trusted; treat the files as unequal
+            log.warn("Could not retrieve checksum from file: {}", e.getMessage());
+            compareResult.equal = false;
+            return compareResult;
           } catch (Exception e) {
             log.warn("Could not retrieve checksum from file.", e);
+            compareResult.equal = false;
           }
         }
 
@@ -1722,11 +1725,10 @@ public class IndexFetcher {
         throw e;
       } finally {
         cleanup(null);
-        //if cleanup succeeds . The file is downloaded fully. do an fsync
+        // if cleanup succeeds, the file has been downloaded fully
         fsyncServiceFuture = fsyncService.submit(() -> {
           try {
-            log.info("Sync and close fetched file", file);
-            file.sync();
+            file.close();
           } catch (Exception e) {
             fsyncException = e;
           }
diff --git a/solr/core/src/java/org/apache/solr/handler/RequestHandlerBase.java b/solr/core/src/java/org/apache/solr/handler/RequestHandlerBase.java
index 8670c17..a9491b6 100644
--- a/solr/core/src/java/org/apache/solr/handler/RequestHandlerBase.java
+++ b/solr/core/src/java/org/apache/solr/handler/RequestHandlerBase.java
@@ -303,7 +303,7 @@ public abstract class RequestHandlerBase implements SolrRequestHandler, SolrInfo
    * This function is thread safe.
    */
   public static SolrRequestHandler getRequestHandler(String handlerName, PluginBag<SolrRequestHandler> reqHandlers) {
-    if (log.isDebugEnabled()) log.debug("get request handler {} from {}", reqHandlers);
+    if (log.isDebugEnabled()) log.debug("get request handler {} from {}", handlerName, reqHandlers);
     if (handlerName == null) return null;
     SolrRequestHandler handler = reqHandlers.get(handlerName);
     int idx = 0;
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java
index 024936a..0e4f66f 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java
@@ -412,13 +412,12 @@ public class CollectionsHandler extends RequestHandlerBase implements Permission
             + event.getWatchedEvent().getState() + " type "
             + event.getWatchedEvent().getType() + "]");
       } else {
-        // nocommit - look into we may still need this
-        // we have to assume success - it was too quick for us to catch the response
+        // TODO: we could do a check based on the request to see how it turned out
 
-        log.error("We did not find the response, there was also no timeout and we did not get a watched event ...");
+        log.error("The Overseer stopped and we don't know if this was a success ...");
 
         NamedList<Object> resp = new NamedList<>();
-        resp.add("success", "true");
+        resp.add("success", "unknown");
         return new OverseerSolrResponse(resp);
       }
     }
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminOperation.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminOperation.java
index 935ee4a..cc60d46 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminOperation.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminOperation.java
@@ -22,6 +22,7 @@ import java.nio.file.Path;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Optional;
+import java.util.concurrent.TimeUnit;
 
 import org.apache.commons.lang3.StringUtils;
 import org.apache.solr.cloud.CloudDescriptor;
@@ -78,6 +79,7 @@ enum CoreAdminOperation implements CoreAdminOp {
     String coreName = params.required().get(CoreAdminParams.NAME);
     MDCLoggingContext.setCoreName(coreName);
     try {
+
       assert TestInjection.injectRandomDelayInCoreCreation();
 
       Map<String,String> coreParams = buildCoreParams(params);
@@ -99,8 +101,9 @@ enum CoreAdminOperation implements CoreAdminOp {
         log().warn("Will not create SolrCore, CoreContainer is shutdown");
         throw new AlreadyClosedException("Will not create SolrCore, CoreContainer is shutdown");
       }
-
+      long start = System.nanoTime();
       coreContainer.create(coreName, instancePath, coreParams, newCollection);
+      log().info("SolrCore {} created in {}ms", coreName, TimeUnit.NANOSECONDS.convert(System.nanoTime() - start, TimeUnit.MILLISECONDS));
 
       it.rsp.add("core", coreName);
     } finally {
@@ -283,6 +286,8 @@ enum CoreAdminOperation implements CoreAdminOp {
     }
   });
 
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
   final CoreAdminParams.CoreAdminAction action;
   final CoreAdminOp fun;
 
@@ -291,7 +296,7 @@ enum CoreAdminOperation implements CoreAdminOp {
     this.fun = fun;
   }
 
-  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
 
   static Logger log() {
     return log;
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
index ca07d0b..d8dfa8e 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
@@ -17,6 +17,7 @@
 
 package org.apache.solr.handler.admin;
 
+import org.apache.solr.cloud.LeaderElector;
 import org.apache.solr.cloud.ZkController.NotInClusterStateException;
 import org.apache.solr.common.ParWork;
 import org.apache.solr.common.SolrException.ErrorCode;
@@ -60,6 +61,11 @@ class PrepRecoveryOp implements CoreAdminHandler.CoreAdminOp {
         "Going to wait for core: {}, state: {}: params={}",
         cname, waitForState, params);
 
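+    // prep-recovery must be serviced by the shard leader; fail fast if this node lost or never held the election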
+    LeaderElector leaderElector = it.handler.coreContainer.getZkController().getLeaderElector(cname);
+    if (leaderElector == null || !leaderElector.isLeader()) {
+      throw new IllegalStateException("Not a valid leader: " + (leaderElector == null ? "no leader elector" : "elector state=" + leaderElector.getState()));
+    }
+
     assert TestInjection.injectPrepRecoveryOpPauseForever();
 
     CoreContainer coreContainer = it.handler.coreContainer;
@@ -67,10 +73,9 @@ class PrepRecoveryOp implements CoreAdminHandler.CoreAdminOp {
     AtomicReference<String> errorMessage = new AtomicReference<>();
 
     try {
-      coreContainer.getZkController().getZkStateReader().waitForState(collection, 5, TimeUnit.SECONDS, (n, c) -> {
+      coreContainer.getZkController().getZkStateReader().waitForState(collection, 10, TimeUnit.SECONDS, (n, c) -> {
         if (c == null) {
-          log.info("collection not found {}", collection);
-          return false;
+          return true;
         }
 
         // wait until we are sure the recovering node is ready
@@ -80,8 +85,7 @@ class PrepRecoveryOp implements CoreAdminHandler.CoreAdminOp {
         if (replica != null) {
           isLive = coreContainer.getZkController().getZkStateReader().isNodeLive(replica.getNodeName());
           if (replica.getState() == waitForState) {
-            // if (log.isDebugEnabled()) log.debug("replica={} state={} waitForState={}", replica, replica.getState(), waitForState);
-            log.info("replica={} state={} waitForState={} isLive={}", replica, replica.getState(), waitForState, coreContainer.getZkController().getZkStateReader().isNodeLive(replica.getNodeName()));
+            if (log.isDebugEnabled()) log.debug("replica={} state={} waitForState={} isLive={}", replica, replica.getState(), waitForState, coreContainer.getZkController().getZkStateReader().isNodeLive(replica.getNodeName()));
             return true;
           }
         }
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/SplitOp.java b/solr/core/src/java/org/apache/solr/handler/admin/SplitOp.java
index 59e7247..c44ef8f 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/SplitOp.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/SplitOp.java
@@ -156,6 +156,7 @@ class SplitOp implements CoreAdminHandler.CoreAdminOp {
 
           if (newcore == null) {
             it.handler.coreContainer.waitForLoadingCore(newCoreName, 10000);
+            // getCore below also waits for a loading core, but with a shorter timeout than the wait above
             newcore = it.handler.coreContainer.getCore(newCoreName);
           }
 
diff --git a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java
index 63eff4b..88f9747 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java
@@ -48,6 +48,7 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.solr.client.solrj.SolrResponse;
 import org.apache.solr.cloud.CloudDescriptor;
+import org.apache.solr.cloud.LeaderElector;
 import org.apache.solr.cloud.ZkController;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrDocumentBase;
@@ -136,15 +137,18 @@ public class RealTimeGetComponent extends SearchComponent
     if (!params.getBool(COMPONENT_NAME, true)) {
       return;
     }
-    
-    // This seems rather kludgey, may there is better way to indicate
-    // that replica can support handling version ranges
-    String val = params.get("checkCanHandleVersionRanges");
-    if(val != null) {
-      rb.rsp.add("canHandleVersionRanges", true);
+
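+    // "onlyIfLeader" asks this core to verify it is still the shard leader before serving the request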
+    String val = params.get("onlyIfLeader");
+    if (val != null && req.getCore().getCoreContainer().isZooKeeperAware()) {
+      LeaderElector leaderElector = req.getCore().getCoreContainer().getZkController().getLeaderElector(req.getCore().getName());
+      if (leaderElector == null || !leaderElector.isLeader()) {
+        throw new IllegalStateException("Not a valid leader");
+      }
+
       return;
     }
-    
+
     val = params.get("getFingerprint");
     if(val != null) {
       processGetFingeprint(rb);
@@ -356,7 +360,7 @@ public class RealTimeGetComponent extends SearchComponent
     if (idStr == null) return;
     AtomicLong version = new AtomicLong();
     SolrInputDocument doc = getInputDocument(req.getCore(), new BytesRef(idStr), version, null, Resolution.DOC);
-    log.info("getInputDocument called for id={}, returning {}", idStr, doc);
+    if (log.isDebugEnabled()) log.debug("getInputDocument called for id={}, returning {}", idStr, doc);
     rb.rsp.add("inputDocument", doc);
     rb.rsp.add("version", version.get());
   }
@@ -970,7 +974,7 @@ public class RealTimeGetComponent extends SearchComponent
     // the mappings.
 
     for (int i=0; i<rb.slices.length; i++) {
-      log.info("LOOKUP_SLICE:{}={}", rb.slices[i], rb.shards[i]);
+      if (log.isDebugEnabled()) log.debug("LOOKUP_SLICE:{}={}", rb.slices[i], rb.shards[i]);
       if (lookup.equals(rb.slices[i]) || slice.equals(rb.slices[i])) {
         return new String[]{rb.shards[i]};
       }
@@ -1189,6 +1193,7 @@ public class RealTimeGetComponent extends SearchComponent
 
     // TODO: get this from cache instead of rebuilding?
     try (UpdateLog.RecentUpdates recentUpdates = ulog.getRecentUpdates()) {
+      if (log.isDebugEnabled()) log.debug("Get updates  versionsRequested={} params={}", versions.size(), params);
       for (Long version : versions) {
         try {
           Object o = recentUpdates.lookup(version);
diff --git a/solr/core/src/java/org/apache/solr/metrics/SolrMetricManager.java b/solr/core/src/java/org/apache/solr/metrics/SolrMetricManager.java
index 8110027..51ca9a1 100644
--- a/solr/core/src/java/org/apache/solr/metrics/SolrMetricManager.java
+++ b/solr/core/src/java/org/apache/solr/metrics/SolrMetricManager.java
@@ -1060,7 +1060,7 @@ public class SolrMetricManager {
         new Object[]{this, registry}
     );
     // prepare MDC for plugins that want to use its properties
-    MDCLoggingContext.setCoreDescriptor(coreContainer, solrCore == null ? null : solrCore.getCoreDescriptor());
+    MDCLoggingContext.setCoreName(solrCore == null ? null : solrCore.getName());
     if (tag != null) {
       // add instance tag to MDC
       MDC.put("tag", "t:" + tag);
diff --git a/solr/core/src/java/org/apache/solr/pkg/PackageListeners.java b/solr/core/src/java/org/apache/solr/pkg/PackageListeners.java
index a082664..54b12ac 100644
--- a/solr/core/src/java/org/apache/solr/pkg/PackageListeners.java
+++ b/solr/core/src/java/org/apache/solr/pkg/PackageListeners.java
@@ -61,7 +61,7 @@ public class PackageListeners {
   }
 
   void packagesUpdated(List<PackageLoader.Package> pkgs) {
-    MDCLoggingContext.setCore(core);
+    MDCLoggingContext.setCoreName(core.getName());
     try {
       for (PackageLoader.Package pkgInfo : pkgs) {
         invokeListeners(pkgInfo);
diff --git a/solr/core/src/java/org/apache/solr/schema/IndexSchema.java b/solr/core/src/java/org/apache/solr/schema/IndexSchema.java
index a54a6b6..e7c192a 100644
--- a/solr/core/src/java/org/apache/solr/schema/IndexSchema.java
+++ b/solr/core/src/java/org/apache/solr/schema/IndexSchema.java
@@ -732,15 +732,13 @@ public class IndexSchema {
             + f.getName() + "' [[["+old.toString()+"]]] and [[["+f.toString()+"]]]";
           throw new SolrException(ErrorCode.SERVER_ERROR, msg );
         }
-        log.debug("field defined: {}", f);
+        if (log.isTraceEnabled()) log.trace("field defined: {}", f);
         if( f.getDefaultValue() != null ) {
-          if (log.isDebugEnabled()) {
-            log.debug("{} contains default value {}", name, f.getDefaultValue());
-          }
+          if (log.isTraceEnabled()) log.trace("{} contains default value {}", name, f.getDefaultValue());
           fieldsWithDefaultValue.add( f );
         }
         if (f.isRequired()) {
-          log.debug("{} is required in this schema", name);
+          if (log.isTraceEnabled()) log.trace("{} is required in this schema", name);
           requiredFields.add(f);
         }
       } else if (nodeValue.equals(DYNAMIC_FIELD)) {
@@ -874,7 +872,7 @@ public class IndexSchema {
 
   private void addDynamicFieldNoDupCheck(List<DynamicField> dFields, SchemaField f) {
     dFields.add(new DynamicField(f));
-    log.debug("dynamic field defined: {}", f);
+    if (log.isTraceEnabled()) log.trace("dynamic field defined: {}", f);
   }
 
   protected boolean isDuplicateDynField(List<DynamicField> dFields, SchemaField f) {
diff --git a/solr/core/src/java/org/apache/solr/schema/ManagedIndexSchema.java b/solr/core/src/java/org/apache/solr/schema/ManagedIndexSchema.java
index e79fc69..a08225a 100644
--- a/solr/core/src/java/org/apache/solr/schema/ManagedIndexSchema.java
+++ b/solr/core/src/java/org/apache/solr/schema/ManagedIndexSchema.java
@@ -208,7 +208,7 @@ public final class ManagedIndexSchema extends IndexSchema {
           if (stat != null) {
             found = stat.getVersion();
           }
-          log.info("Bad version when trying to persist schema using {} found {} schema {}", ver, found, this);
+          if (log.isDebugEnabled()) log.debug("Bad version when trying to persist schema using {} found {} schema {}", ver, found, this);
 
           schemaChangedInZk = true;
         }
@@ -223,7 +223,7 @@ public final class ManagedIndexSchema extends IndexSchema {
     }
     if (schemaChangedInZk) {
       String msg = "Failed to persist managed schema at " + managedSchemaPath + " - version mismatch";
-      log.info(msg);
+      if (log.isDebugEnabled()) log.debug(msg);
       throw new SchemaChangedInZkException(ErrorCode.CONFLICT, msg + ", retry.");
     }
 
diff --git a/solr/core/src/java/org/apache/solr/schema/ZkIndexSchemaReader.java b/solr/core/src/java/org/apache/solr/schema/ZkIndexSchemaReader.java
index c942a4a..83245aa 100644
--- a/solr/core/src/java/org/apache/solr/schema/ZkIndexSchemaReader.java
+++ b/solr/core/src/java/org/apache/solr/schema/ZkIndexSchemaReader.java
@@ -98,7 +98,7 @@ public class ZkIndexSchemaReader implements OnReconnect {
    *
    */
   public void createSchemaWatcher() {
-    log.info("Creating ZooKeeper watch for the managed schema at {}", managedSchemaPath);
+    if (log.isDebugEnabled()) log.debug("Creating ZooKeeper watch for the managed schema at {}", managedSchemaPath);
     IOUtils.closeQuietly(schemaWatcher);
     schemaWatcher = new SchemaWatcher(this);
   }
@@ -132,6 +132,8 @@ public class ZkIndexSchemaReader implements OnReconnect {
     public void close() throws IOException {
       try {
         schemaReader.zkClient.getSolrZooKeeper().removeWatches(schemaReader.managedSchemaPath, this, WatcherType.Any, true);
+      } catch (KeeperException.NoWatcherException e) {
+
       } catch (Exception e) {
         if (log.isDebugEnabled()) log.debug("could not remove watch {} {}", e.getClass().getSimpleName(), e.getMessage());
       }
@@ -162,10 +164,10 @@ public class ZkIndexSchemaReader implements OnReconnect {
 
       v = managedIndexSchemaFactory.getSchema().getSchemaZkVersion();
 
-      log.info("Retrieved schema version {} from Zookeeper, existing={} schema={}", existsVersion, v, managedIndexSchemaFactory.getSchema());
+      if (log.isDebugEnabled()) log.debug("Retrieved schema version {} from Zookeeper, existing={} schema={}", existsVersion, v, managedIndexSchemaFactory.getSchema());
 
       if (v >= existsVersion) {
-        log.info("Old schema version {} is >= found version {}", v, existsVersion);
+        if (log.isDebugEnabled()) log.debug("Old schema version {} is >= found version {}", v, existsVersion);
 
         return null;
       }
diff --git a/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java b/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java
index 790a3eb..0421f6b 100644
--- a/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java
+++ b/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java
@@ -257,14 +257,16 @@ public class HttpSolrCall {
         path = path.substring(idx2);
       }
 
-      cores.waitForLoadingCore(origCorename, 15000);
-      // the core may have just finished loading
-
       // Try to resolve a Solr core name
       core = cores.getCore(origCorename);
 
       if (log.isDebugEnabled()) log.debug("tried to get core by name {} got {}, existing cores {} found={}", origCorename, core, cores.getAllCoreNames(), core != null);
 
+//      if (core == null) {
+//        // nocommit
+//        log.info("tried to get core by name {} got {}, existing cores {} found={}", origCorename, core, cores.getAllCoreNames(), core != null);
+//      }
+
       if (core != null) {
         path = path.substring(idx);
         if (log.isDebugEnabled()) log.debug("Path is parsed as {}", path);
@@ -281,7 +283,6 @@ public class HttpSolrCall {
       }
     }
 
-
     if (cores.isZooKeeperAware()) {
       // init collectionList (usually one name but not when there are aliases)
       String def = core != null ? core.getCoreDescriptor().getCollectionName() : origCorename;
@@ -555,8 +556,9 @@ public class HttpSolrCall {
     if (activeSpan != null) {
       MDCLoggingContext.setTracerId(activeSpan.context().toTraceId());
     }
-
-    MDCLoggingContext.setNode(cores);
+    if (cores != null && cores.isZooKeeperAware()) {
+      MDCLoggingContext.setNode(cores.getZkController().getNodeName());
+    }
 
     if (cores == null) {
       sendError(503, "Server is shutting down or failed to initialize");
diff --git a/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java b/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java
index 7a782ea..0799598 100644
--- a/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java
+++ b/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java
@@ -79,6 +79,7 @@ import org.apache.solr.core.SolrCore;
 import org.apache.solr.core.SolrInfoBean;
 import org.apache.solr.core.SolrPaths;
 import org.apache.solr.core.SolrXmlConfig;
+import org.apache.solr.logging.MDCLoggingContext;
 import org.apache.solr.metrics.AltBufferPoolMetricSet;
 import org.apache.solr.metrics.MetricsMap;
 import org.apache.solr.metrics.SolrMetricManager;
@@ -95,6 +96,7 @@ import org.apache.zookeeper.KeeperException;
 import org.eclipse.jetty.client.HttpClient;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.slf4j.MDC;
 
 import static org.apache.solr.security.AuditEvent.EventType;
 
@@ -403,7 +405,14 @@ public class SolrDispatchFilter extends BaseSolrFilter {
   
   @Override
   public void destroy() {
-    close();
+    if (cores != null && cores.isZooKeeperAware()) {
+      MDCLoggingContext.setNode(cores.getZkController().getNodeName());
+    }
+    try {
+      close();
+    } finally {
+      MDCLoggingContext.clear();
+    }
   }
   
   public void close() {
diff --git a/solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java b/solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java
index f6b6f13..3710b54 100644
--- a/solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java
+++ b/solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java
@@ -65,49 +65,11 @@ public class SolrQoSFilter extends QoSFilter {
     boolean imagePath = req.getPathInfo() != null && req.getPathInfo().startsWith("/img/");
     boolean externalRequest = !imagePath && (source == null || !source.equals(QoSParams.INTERNAL));
     if (log.isDebugEnabled()) log.debug("SolrQoSFilter {} {} {}", sysStats.getSystemLoad(), sysStats.getTotalUsage(), externalRequest);
+    //log.info("SolrQoSFilter {} {} {}", sysStats.getSystemLoad(), sysStats.getTotalUsage(), externalRequest);
 
     if (externalRequest) {
       if (log.isDebugEnabled()) log.debug("external request"); //nocommit: remove when testing is done
-      double ourLoad = sysStats.getTotalUsage();
-      if (log.isDebugEnabled()) log.debug("Our individual load is {}", ourLoad);
-      double sLoad = sysStats.getSystemLoad();
-      if (ourLoad > SysStats.OUR_LOAD_HIGH) {
-
-        int cMax = getMaxRequests();
-        if (cMax > 5) {
-          int max = Math.max(5, (int) ((double) cMax * 0.30D));
-          log.warn("Our individual load is {}", ourLoad);
-          updateMaxRequests(max, sLoad, ourLoad);
-        }
-
-      } else {
-        // nocommit - deal with no supported, use this as a fail safe with high and low watermark?
-        if (ourLoad < 0.90 && sLoad < 1.6 && _origMaxRequests != getMaxRequests()) {
-          if (sLoad < 0.9) {
-            if (log.isDebugEnabled()) log.debug("set max concurrent requests to orig value {}", _origMaxRequests);
-            updateMaxRequests(_origMaxRequests, sLoad, ourLoad);
-          } else {
-            updateMaxRequests(Math.min(_origMaxRequests, getMaxRequests() * 3), sLoad, ourLoad);
-          }
-        } else {
-          if (ourLoad > 0.90 && sLoad > 1.5) {
-            int cMax = getMaxRequests();
-            if (cMax > 5) {
-              int max = Math.max(5, (int) ((double) cMax * 0.30D));
-            //  log.warn("System load is {} and our load is {} procs is {}, set max concurrent requests to {}", sLoad, ourLoad, SysStats.PROC_COUNT, max);
-              updateMaxRequests(max, sLoad, ourLoad);
-            }
-          } else if (ourLoad < 0.90 && sLoad < 2 && _origMaxRequests != getMaxRequests()) {
-            if (sLoad < 0.9) {
-              if (log.isDebugEnabled()) log.debug("set max concurrent requests to orig value {}", _origMaxRequests);
-              updateMaxRequests(_origMaxRequests, sLoad, ourLoad);
-            } else {
-              updateMaxRequests(Math.min(_origMaxRequests, getMaxRequests() * 3), sLoad, ourLoad);
-            }
-
-          }
-        }
-      }
+      checkLoad();
 
       //chain.doFilter(req, response);
       super.doFilter(req, response, chain);
@@ -118,11 +80,68 @@ public class SolrQoSFilter extends QoSFilter {
     }
   }
 
+  private void checkLoad() {
+    double ourLoad = sysStats.getTotalUsage();
+    int currentMaxRequests = getMaxRequests();
+    if (log.isDebugEnabled()) log.debug("Our individual load is {}", ourLoad);
+    double sLoad = sysStats.getSystemLoad();
+
+
+    if (lowStateLoad(sLoad, currentMaxRequests)) {
+      // load has dropped back below the watermark; restore the original request limit
+      updateMaxRequests(_origMaxRequests, sLoad, ourLoad);
+    } else if (hiLoadState(sLoad, currentMaxRequests)) {
+      // first trip throttles to 100 concurrent requests; a repeat trip clamps harder to 50
+      if (currentMaxRequests == _origMaxRequests) {
+        updateMaxRequests(100, sLoad, ourLoad);
+      } else {
+        updateMaxRequests(50, sLoad, ourLoad);
+      }
+    }
+    // nocommit - deal with not supported, use this as a fail safe with high and low watermark?
+  }
+
+  private boolean lowStateLoad(double sLoad, int currentMaxRequests) {
+    return currentMaxRequests < _origMaxRequests && sLoad < .95d;
+  }
+
+  private boolean hiLoadState(double sLoad, int currentMaxRequests) {
+    return sLoad > 0.95d;
+  }
+
   private void updateMaxRequests(int max, double sLoad, double ourLoad) {
-    if (System.currentTimeMillis() - lastUpdate > 2000) {
+    int currentMax = getMaxRequests();
+    if (max < currentMax) {
+      if (System.currentTimeMillis() - lastUpdate > 500) {
+        log.warn("Set max request to {} sload={} ourload={}", max, sLoad, ourLoad);
+        lastUpdate = System.currentTimeMillis();
+        setMaxRequests(max);
+      }
+    } else if (max > currentMax) {
+
       log.warn("Set max request to {} sload={} ourload={}", max, sLoad, ourLoad);
       lastUpdate = System.currentTimeMillis();
       setMaxRequests(max);
     }
+
+  }
+
+  @Override
+  protected int getPriority(ServletRequest request) {
+    HttpServletRequest baseRequest = (HttpServletRequest) request;
+
+    String pathInfo = baseRequest.getPathInfo();
+    if (log.isDebugEnabled()) log.debug("pathInfo={}", pathInfo);
+
+    // prioritize collection admin requests so cluster management is not starved under throttling
+    if (pathInfo != null && pathInfo.equals("/admin/collections")) {
+      return 5;
+    }
+
+    return 0;
   }
 }
\ No newline at end of file
diff --git a/solr/core/src/java/org/apache/solr/servlet/StopJetty.java b/solr/core/src/java/org/apache/solr/servlet/StopJetty.java
index 869e771..250b858 100644
--- a/solr/core/src/java/org/apache/solr/servlet/StopJetty.java
+++ b/solr/core/src/java/org/apache/solr/servlet/StopJetty.java
@@ -48,7 +48,7 @@ public class StopJetty {
         out.flush();
         if (timeout > 0)
         {
-          System.err.printf("Waiting %,d seconds for jetty to stop%n",timeout);
+          System.err.printf("Waiting %,d seconds for Solr to stop%n",timeout);
           LineNumberReader lin = new LineNumberReader(new InputStreamReader(s.getInputStream()));
           String response;
           while ((response = lin.readLine()) != null)
diff --git a/solr/core/src/java/org/apache/solr/update/AddUpdateCommand.java b/solr/core/src/java/org/apache/solr/update/AddUpdateCommand.java
index e192fe9..8c945f0 100644
--- a/solr/core/src/java/org/apache/solr/update/AddUpdateCommand.java
+++ b/solr/core/src/java/org/apache/solr/update/AddUpdateCommand.java
@@ -43,7 +43,7 @@ public class AddUpdateCommand extends UpdateCommand {
    * Higher level SolrInputDocument, normally used to construct the Lucene Document(s)
    * to index.
    */
-  public SolrInputDocument solrDoc;
+  public volatile SolrInputDocument solrDoc;
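+  // volatile: an AddUpdateCommand may be handed off between threads during update processing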
 
   /**
    * This is the version of a document, previously indexed, on which the current
@@ -51,7 +51,7 @@ public class AddUpdateCommand extends UpdateCommand {
    * or a full update. A negative value here, e.g. -1, indicates that this add
    * update does not depend on a previous update.
    */
-  public long prevVersion = -1;
+  public volatile long prevVersion = -1;
 
   public boolean overwrite = true;
 
@@ -62,14 +62,14 @@ public class AddUpdateCommand extends UpdateCommand {
 
   public int commitWithin = -1;
 
-  public boolean isLastDocInBatch = false;
+  public volatile boolean isLastDocInBatch = false;
 
   /** Is this a nested update, null means not yet calculated. */
-  public Boolean isNested = null;
+  public volatile Boolean isNested = null;
 
   // optional id in "internal" indexed form... if it is needed and not supplied,
   // it will be obtained from the doc.
-  private BytesRef indexedId;
+  private volatile BytesRef indexedId;
 
   public AddUpdateCommand(SolrQueryRequest req) {
     super(req);
diff --git a/solr/core/src/java/org/apache/solr/update/CommitTracker.java b/solr/core/src/java/org/apache/solr/update/CommitTracker.java
index 3c28886..e1ca402 100644
--- a/solr/core/src/java/org/apache/solr/update/CommitTracker.java
+++ b/solr/core/src/java/org/apache/solr/update/CommitTracker.java
@@ -294,7 +294,7 @@ public final class CommitTracker implements Runnable, Closeable {
       lock.unlock();
     }
 
-    MDCLoggingContext.setCore(core);
+    MDCLoggingContext.setCoreName(core.getName());
     try (SolrQueryRequest req = new LocalSolrQueryRequest(core, new ModifiableSolrParams())) {
       CommitUpdateCommand command = new CommitUpdateCommand(req, false);
       command.openSearcher = openSearcher;
diff --git a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
index 02ecd79..bf3d240 100644
--- a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
+++ b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
@@ -35,6 +35,7 @@ import org.apache.solr.util.RefCounted;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.File;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.util.concurrent.Callable;
@@ -272,7 +273,7 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover
     SolrIndexWriter iw;
     try {
       iw = SolrIndexWriter.buildIndexWriter(core, name, core.getNewIndexDir(), core.getDirectoryFactory(), false, core.getLatestSchema(),
-              core.getSolrConfig().indexConfig, core.getDeletionPolicy(), core.getCodec());
+              core.getSolrConfig().indexConfig, core.getDeletionPolicy(), core.getCodec(), false);
     } catch (Exception e) {
       ParWork.propagateInterrupt(e);
       throw new SolrException(ErrorCode.SERVER_ERROR, e);
@@ -318,10 +319,14 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover
 
     log.info("Do recovery for core {}", core.getName());
     CoreContainer corecontainer = core.getCoreContainer();
-
+    if (prepForClose || closed || corecontainer.isShutDown()) {
+      log.warn("Skipping recovery because Solr is shutdown");
+      return;
+    }
     Runnable recoveryTask = () -> {
       CoreDescriptor coreDescriptor = core.getCoreDescriptor();
-      MDCLoggingContext.setCoreDescriptor(corecontainer, coreDescriptor);
+      MDCLoggingContext.setCoreName(core.getName());
+      MDCLoggingContext.setNode(corecontainer.getZkController().getNodeName());
       try {
         if (SKIP_AUTO_RECOVERY) {
           log.warn("Skipping recovery according to sys prop solrcloud.skip.autorecovery");
@@ -432,7 +437,7 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover
 
   @Override
   public void cancelRecovery(boolean wait, boolean prepForClose) {
-    log.info("Cancel recovery");
+    if (log.isDebugEnabled()) log.debug("Cancel recovery");
     recoverying = false;
     
     if (prepForClose) {
diff --git a/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java b/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java
index 825afe2..4e92e9e 100644
--- a/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java
+++ b/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 import java.util.Locale;
 import java.util.concurrent.atomic.AtomicLong;
@@ -221,7 +222,7 @@ public class HdfsUpdateLog extends UpdateLog {
     // TODO: these startingVersions assume that we successfully recover from all
     // non-complete tlogs.
     try (RecentUpdates startingUpdates = getRecentUpdates()) {
-      startingVersions = startingUpdates.getVersions(getNumRecordsToKeep());
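+      // hand out an unmodifiable view so callers cannot mutate the shared startingVersions list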
+      startingVersions = Collections.unmodifiableList(startingUpdates.getVersions(getNumRecordsToKeep()));
 
       // populate recent deletes list (since we can't get that info from the
       // index)
diff --git a/solr/core/src/java/org/apache/solr/update/PeerSync.java b/solr/core/src/java/org/apache/solr/update/PeerSync.java
index 387f27e..08ca70c 100644
--- a/solr/core/src/java/org/apache/solr/update/PeerSync.java
+++ b/solr/core/src/java/org/apache/solr/update/PeerSync.java
@@ -140,6 +140,7 @@ public class PeerSync implements SolrMetricProducer {
   }
 
   public static long percentile(List<Long> arr, float frac) {
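+    // guard: arr.get(elem) below would throw on an empty version list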
+    if (arr.size() == 0) return 0;
     int elem = (int) (arr.size() * frac);
     return Math.abs(arr.get(elem));
   }
@@ -206,25 +207,8 @@ public class PeerSync implements SolrMetricProducer {
         // we have no versions and hence no frame of reference to tell if we can use a peers
         // updates to bring us into sync
 
-        log.info("{} DONE. We have no versions. sync failed.", msg());
-
-        for (;;)  {
-          if (log.isDebugEnabled()) log.debug("looping in check for versions on others");
-          ShardResponse srsp = shardHandler.takeCompletedIncludingErrors();
-          if (srsp == null) break;
-          if (srsp.getException() == null)  {
-            if (log.isDebugEnabled()) log.debug("checking if others have versions {} {}", srsp.getSolrResponse().getResponse());
-            List<Long> otherVersions = (List<Long>)srsp.getSolrResponse().getResponse().get("versions");
-            if (otherVersions != null && !otherVersions.isEmpty())  {
-              if (syncErrors != null) syncErrors.inc();
-              if (log.isDebugEnabled()) log.debug("found another replica with versions");
-              return PeerSyncResult.failure(true);
-            }
-          }
-        }
-        if (syncErrors != null) syncErrors.inc();
-        if (log.isDebugEnabled()) log.debug("found no other replica with versions");
-        return PeerSyncResult.failure(false);
+        return failOnNoVersions();
+
       }
 
       MissedUpdatesFinder missedUpdatesFinder = new MissedUpdatesFinder(ourUpdates, msg(), nUpdates, ourLowThreshold, ourHighThreshold);
@@ -265,6 +249,27 @@ public class PeerSync implements SolrMetricProducer {
     }
   }
 
+  private PeerSyncResult failOnNoVersions() {
+    log.info("{} DONE. We have no versions. sync failed.", msg());
+
+    for (;;)  {
+      ShardResponse srsp = shardHandler.takeCompletedIncludingErrors();
+      if (srsp == null) break;
+      if (srsp.getException() == null)  {
+        if (log.isDebugEnabled()) log.debug("checking if others have versions {} {}", srsp.getSolrResponse().getResponse());
+        List<Long> otherVersions = (List<Long>)srsp.getSolrResponse().getResponse().get("versions");
+        if (otherVersions != null && !otherVersions.isEmpty())  {
+          if (syncErrors != null) syncErrors.inc();
+          if (log.isDebugEnabled()) log.debug("found another replica with versions");
+          return PeerSyncResult.failure(true);
+        }
+      }
+    }
+    if (syncErrors != null) syncErrors.inc();
+    if (log.isDebugEnabled()) log.debug("found no other replica with versions");
+    return PeerSyncResult.failure(false);
+  }
+
   /**
    * Check if we are already in sync. Simple fingerprint comparison should do
    */
@@ -406,31 +411,6 @@ public class PeerSync implements SolrMetricProducer {
     }
   }
 
-  private boolean canHandleVersionRanges(String replica) {
-    SyncShardRequest sreq = new SyncShardRequest();
-    requests.add(sreq);
-
-    // determine if leader can handle version ranges
-    sreq.shards = new String[] {replica};
-    sreq.actualShards = sreq.shards;
-    sreq.params = new ModifiableSolrParams();
-    sreq.params.set("qt", "/get");
-    sreq.params.set(DISTRIB, false);
-    sreq.params.set("checkCanHandleVersionRanges", false);
-
-    ShardHandler sh = shardHandlerFactory.getShardHandler();
-    sh.submit(sreq, replica, sreq.params);
-
-    ShardResponse srsp = sh.takeCompletedIncludingErrors();
-    Boolean canHandleVersionRanges = srsp.getSolrResponse().getResponse().getBooleanArg("canHandleVersionRanges");
-
-    if (canHandleVersionRanges == null || canHandleVersionRanges.booleanValue() == false) {
-      return false;
-    }
-
-    return true;
-  }
-
   private boolean handleVersions(ShardResponse srsp, MissedUpdatesFinder missedUpdatesFinder) {
     // we retrieved the last N updates from the replica
     @SuppressWarnings({"unchecked"})
@@ -440,8 +420,8 @@ public class PeerSync implements SolrMetricProducer {
     SyncShardRequest sreq = (SyncShardRequest) srsp.getShardRequest();
     Object fingerprint = srsp.getSolrResponse().getResponse().get("fingerprint");
 
-    if (log.isInfoEnabled()) {
-      log.info("{} Received {} versions from {} fingerprint:{}", msg(), otherVersions.size(), sreq.shards[0], fingerprint);
+    if (log.isDebugEnabled()) {
+      log.debug("{} Received {} versions from {} versions={} fingerprint:{}", msg(), otherVersions.size(), sreq.shards[0], otherVersions, fingerprint);
     }
     if (fingerprint != null) {
       sreq.fingerprint = IndexFingerprint.fromObject(fingerprint);
@@ -454,7 +434,7 @@ public class PeerSync implements SolrMetricProducer {
     
     MissedUpdatesRequest updatesRequest = missedUpdatesFinder.find(
         otherVersions, sreq.shards[0],
-        () -> core.getSolrConfig().useRangeVersionsForPeerSync && canHandleVersionRanges(sreq.shards[0]));
+        () -> core.getSolrConfig().useRangeVersionsForPeerSync);
 
     if (updatesRequest == MissedUpdatesRequest.ALREADY_IN_SYNC) {
       return true;
@@ -524,7 +504,7 @@ public class PeerSync implements SolrMetricProducer {
 
     SyncShardRequest sreq = (SyncShardRequest) srsp.getShardRequest();
     if (updates.size() < sreq.totalRequestedUpdates) {
-      log.error("{} Requested {} updates from {} but retrieved {}", msg(), sreq.totalRequestedUpdates, sreq.shards[0], updates.size());
+      log.error("{} Requested {} updates from {} but retrieved {} {}", msg(), sreq.totalRequestedUpdates, sreq.shards[0], updates.size(), srsp.getSolrResponse().getResponse());
       return false;
     }
     
@@ -746,7 +726,7 @@ public class PeerSync implements SolrMetricProducer {
       return true;
     }
 
-    MissedUpdatesRequest handleVersionsWithRanges(List<Long> otherVersions, boolean completeList) {
+    static MissedUpdatesRequest handleVersionsWithRanges(List<Long> ourUpdates, List<Long> otherVersions, boolean completeList, long ourLowThreshold) {
       // we may end up asking for updates for too many versions, running into the 2MB POST payload limit. Construct a range of
       // versions to request instead of asking individual versions
       List<String> rangesToRequest = new ArrayList<>();
@@ -788,6 +768,9 @@ public class PeerSync implements SolrMetricProducer {
       }
 
       String rangesToRequestStr = rangesToRequest.stream().collect(Collectors.joining(","));
+
+      if (log.isDebugEnabled()) log.debug("handleVersionsWithRanges rangesToRequestStr={} otherVersions={} ourVersions={} completeList={} totalRequestedVersions={}", rangesToRequestStr, otherVersions, ourUpdates, completeList, totalRequestedVersions);
+
       return MissedUpdatesRequest.of(rangesToRequestStr, totalRequestedVersions);
     }
 
@@ -867,7 +850,7 @@ public class PeerSync implements SolrMetricProducer {
 
       MissedUpdatesRequest updatesRequest;
       if (canHandleVersionRanges.get()) {
-        updatesRequest = handleVersionsWithRanges(otherVersions, completeList);
+        updatesRequest = handleVersionsWithRanges(ourUpdates, otherVersions, completeList, ourLowThreshold);
       } else {
         updatesRequest = handleIndividualVersions(otherVersions, completeList);
       }
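
Note on the handleVersionsWithRanges refactor above: making it a static function of both version lists makes the two-pointer merge easier to test in isolation. A minimal sketch of the idea, not the shipped code (illustrative names; it assumes ascending, positive, duplicate-free versions, while the real method walks descending lists and compares absolute values so that delete versions, which are negative, sort correctly):

    import java.util.ArrayList;
    import java.util.List;

    // Sketch only: fold the versions present on the other replica but missing
    // from ours into "lo...hi" range tokens for /get's getUpdates parameter,
    // so the request stays small even when thousands of versions are missed.
    static String missedRanges(List<Long> ours, List<Long> theirs) {
      List<String> ranges = new ArrayList<>();
      int i = 0; // cursor into ours
      for (int j = 0; j < theirs.size(); ) {
        while (i < ours.size() && ours.get(i) < theirs.get(j)) i++;
        if (i < ours.size() && ours.get(i).equals(theirs.get(j))) { i++; j++; continue; }
        long lo = theirs.get(j);
        long hi = lo;
        // extend the range across every consecutive version we are missing
        while (j < theirs.size() && (i >= ours.size() || theirs.get(j) < ours.get(i))) {
          hi = theirs.get(j++);
        }
        ranges.add(lo + "..." + hi);
      }
      return String.join(",", ranges);
    }

Folding runs of missed versions into lo...hi tokens keeps the getUpdates parameter bounded by the number of gaps rather than by the number of missed updates.
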
diff --git a/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java b/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java
index 4582fc6..fd78e3c 100644
--- a/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java
+++ b/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java
@@ -19,6 +19,7 @@ package org.apache.solr.update;
 
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
 import java.util.function.Supplier;
@@ -119,15 +120,17 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
 
   /**
    * Sync with leader
-   * @param startingVersions : recent versions on startup
+   * @param startVersions : recent versions on startup
    * @return result of PeerSync with leader
    */
-  public PeerSync.PeerSyncResult sync(List<Long> startingVersions){
+  public PeerSync.PeerSyncResult sync(List<Long> startVersions){
     if (ulog == null) {
       syncErrors.inc();
       return PeerSync.PeerSyncResult.failure();
     }
 
+    ArrayList<Long> startingVersions = new ArrayList<>(startVersions);
+
     if (startingVersions.isEmpty()) {
       NamedList<Object> rsp = getVersions();
       IndexFingerprint fingerPrint = getFingerprint(rsp);
@@ -171,9 +174,12 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
 
       // now make sure that the starting updates overlap our updates
       // there shouldn't be reorders, so any overlap will do.
-      long smallestNewUpdate = Math.abs(ourUpdates.get(ourUpdates.size() - 1));
+      long smallestNewUpdate = 0;
+      if (ourUpdates.size() > 0) {
+        smallestNewUpdate = Math.abs(ourUpdates.get(ourUpdates.size() - 1));
+      }
 
-      if (Math.abs(startingVersions.get(0)) < smallestNewUpdate) {
+      if (!startingVersions.isEmpty() && Math.abs(startingVersions.get(0)) < smallestNewUpdate) {
         log.warn("{} too many updates received since start - startingUpdates no longer overlaps with our currentUpdates", msg());
         syncErrors.inc();
         return PeerSync.PeerSyncResult.failure();
@@ -242,9 +248,9 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
       return MissedUpdatesRequest.UNABLE_TO_SYNC;
     }
 
-    MissedUpdatesRequest updatesRequest = missedUpdatesFinder.find(otherVersions, leaderUrl, () -> core.getSolrConfig().useRangeVersionsForPeerSync && canHandleVersionRanges());
+    MissedUpdatesRequest updatesRequest = missedUpdatesFinder.find(otherVersions, leaderUrl, () -> core.getSolrConfig().useRangeVersionsForPeerSync);
     if (updatesRequest == MissedUpdatesRequest.EMPTY) {
-      if (doFingerprint) return MissedUpdatesRequest.UNABLE_TO_SYNC;
+      if (doFingerprint && updatesRequest.totalRequestedUpdates > 0) return MissedUpdatesRequest.UNABLE_TO_SYNC;
       return MissedUpdatesRequest.ALREADY_IN_SYNC;
     }
 
@@ -262,6 +268,7 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
     params.set(DISTRIB, false);
     params.set("getUpdates", missedUpdatesRequest.versionsAndRanges);
     params.set("onlyIfActive", false);
+    params.set("onlyIfLeader", true);
     params.set("skipDbq", true);
 
     return request(params, "Failed on getting missed updates from the leader");
@@ -273,7 +280,7 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
     List<Object> updates = (List<Object>)rsp.get("updates");
 
     if (updates.size() < numRequestedUpdates) {
-      log.error("{} Requested {} updated from {} but retrieved {}", msg(), numRequestedUpdates, leaderUrl, updates.size());
+      log.error("{} Requested {} updates from {} but retrieved {} {}", msg(), numRequestedUpdates, leaderUrl, updates.size(), rsp);
       return false;
     }
 
@@ -298,13 +305,16 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
         // only DBI or DBQ in the gap (above) will satisfy this predicate
         return version > leaderFingerprint.getMaxVersionEncountered() && (oper == UpdateLog.DELETE || oper == UpdateLog.DELETE_BY_QUERY);
       });
+      if (log.isDebugEnabled()) log.debug("existDBIOrDBQInTheGap={}", existDBIOrDBQInTheGap);
       if (!existDBIOrDBQInTheGap) {
         // it is safe to use leaderFingerprint.maxVersionEncountered as cut point now.
         updates.removeIf(e -> {
           @SuppressWarnings({"unchecked"})
           List<Object> u = (List<Object>) e;
           long version = (Long) u.get(1);
-          return version > leaderFingerprint.getMaxVersionEncountered();
+          boolean remove = version > leaderFingerprint.getMaxVersionEncountered();
+          if (log.isDebugEnabled()) log.debug("removeIf version={} leaderFingerprint.getMaxVersionEncountered={} remove={}", version, leaderFingerprint.getMaxVersionEncountered(), remove);
+          return remove;
         });
       }
     }
@@ -312,24 +322,12 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
     try {
       updater.applyUpdates(updates, leaderUrl);
     } catch (Exception e) {
+      log.error("Could not apply updates", e);
       return false;
     }
     return true;
   }
 
-  // determine if leader can handle version ranges
-  private boolean canHandleVersionRanges() {
-    ModifiableSolrParams params = new ModifiableSolrParams();
-    params.set("qt", "/get");
-    params.set(DISTRIB, false);
-    params.set("checkCanHandleVersionRanges", false);
-
-    NamedList<Object> rsp = request(params, "Failed on determine if leader can handle version ranges");
-    Boolean canHandleVersionRanges = rsp.getBooleanArg("canHandleVersionRanges");
-
-    return canHandleVersionRanges != null && canHandleVersionRanges;
-  }
-
   private NamedList<Object> request(ModifiableSolrParams params, String onFail) {
     try {
       QueryRequest qr = new QueryRequest(params, SolrRequest.METHOD.POST);
@@ -351,6 +349,7 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
     params.set(DISTRIB,false);
     params.set("getVersions",nUpdates);
     params.set("fingerprint",doFingerprint);
+    params.set("onlyIfLeader", true);
 
     return request(params, "Failed to get recent versions from leader");
   }
@@ -360,6 +359,7 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
     params.set("qt", "/get");
     params.set(DISTRIB,false);
     params.set("getFingerprint", String.valueOf(Long.MAX_VALUE));
+    params.set("onlyIfLeader", true);
 
     NamedList<Object> rsp = request(params, "Failed to get fingerprint from leader");
     IndexFingerprint leaderFingerprint = getFingerprint(rsp);
@@ -386,6 +386,7 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
       if (cmp != 0) {
         if (log.isDebugEnabled()) log.debug("Leader fingerprint: {}, Our fingerprint: {}", leaderFingerprint , ourFingerprint);
       }
+
       return cmp == 0;  // currently, we only check for equality...
     } catch (IOException e) {
       log.warn("Could not confirm if we are already in sync. Continue with PeerSync");
@@ -426,7 +427,7 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
       boolean completeList = leaderVersions.size() < nUpdates;
       MissedUpdatesRequest updatesRequest;
       if (canHandleVersionRanges.get()) {
-        updatesRequest = handleVersionsWithRanges(leaderVersions, completeList);
+        updatesRequest = handleVersionsWithRanges(ourUpdates, leaderVersions, completeList, ourLowThreshold);
       } else {
         updatesRequest = handleIndividualVersions(leaderVersions, completeList);
       }
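
All of the leader-directed /get calls above (getVersions, getUpdates, getFingerprint) now carry onlyIfLeader=true, so a node that has lost leadership refuses to answer rather than handing the syncing replica a stale version list. A sketch of the resulting request shape, mirroring the parameters set in these hunks (the getVersions count is an arbitrary example value):

    import org.apache.solr.client.solrj.SolrRequest;
    import org.apache.solr.client.solrj.request.QueryRequest;
    import org.apache.solr.common.params.ModifiableSolrParams;

    // Sketch of a leader-only versions request; parameter names are the ones
    // used in the hunks above, the count is illustrative.
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set("qt", "/get");
    params.set("distrib", false);
    params.set("getVersions", 100);
    params.set("fingerprint", true);
    params.set("onlyIfLeader", true);
    QueryRequest req = new QueryRequest(params, SolrRequest.METHOD.POST);
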
diff --git a/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java b/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java
index 8ef8913..5f781ad 100644
--- a/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java
+++ b/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java
@@ -23,7 +23,6 @@ import java.lang.invoke.MethodHandles;
 import java.util.List;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.TimeUnit;
 
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.impl.BinaryResponseParser;
@@ -120,7 +119,7 @@ public class SolrCmdDistributor implements Closeable {
 
     // this can happen in certain situations such as close
     if (isRetry) {
-      if (rspCode == 404 || rspCode == 403 || rspCode == 503) {
+      if (rspCode == 403 || rspCode == 503) {
         doRetry = true;
       }
 
@@ -281,9 +280,12 @@ public class SolrCmdDistributor implements Closeable {
 
         @Override
         public void onFailure(Throwable t, int code) {
-          log.error("Exception sending dist update {}", code, t);
+          log.error("Exception sending dist update code={}", code, t);
           cancels.remove(cancelIndex);
 
+
+          // nocommit - we want to prevent any more of this request from going
+          // to just this node, rather than stopping the whole request
           if (code == 404) {
             cancelExeption = t;
             return;
diff --git a/solr/core/src/java/org/apache/solr/update/SolrCoreState.java b/solr/core/src/java/org/apache/solr/update/SolrCoreState.java
index c53fe21..b0885ef 100644
--- a/solr/core/src/java/org/apache/solr/update/SolrCoreState.java
+++ b/solr/core/src/java/org/apache/solr/update/SolrCoreState.java
@@ -70,7 +70,7 @@ public abstract class SolrCoreState {
     boolean close = false;
     synchronized (this) {
       solrCoreStateRefCnt--;
-      log.info("SolrCoreState ref count {}", solrCoreStateRefCnt);
+      if (log.isDebugEnabled()) log.debug("SolrCoreState ref count {}", solrCoreStateRefCnt);
 
       if (solrCoreStateRefCnt == 0) {
         closed = true;
@@ -80,7 +80,7 @@ public abstract class SolrCoreState {
     
     if (close) {
       try {
-        log.debug("Closing SolrCoreState");
+        if (log.isDebugEnabled()) log.debug("Closing SolrCoreState");
         close(closer);
       } catch (Exception e) {
         log.error("Error closing SolrCoreState", e);
diff --git a/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java b/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java
index 3e55dab..d66f777 100644
--- a/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java
+++ b/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java
@@ -291,7 +291,7 @@ public class SolrIndexSplitter {
           t = timings.sub("createSubIW");
           t.resume();
           iw = SolrIndexWriter.buildIndexWriter(core, partitionName, path, core.getDirectoryFactory(), true, core.getLatestSchema(),
-                  core.getSolrConfig().indexConfig, core.getDeletionPolicy(), core.getCodec());
+                  core.getSolrConfig().indexConfig, core.getDeletionPolicy(), core.getCodec(), true);
           t.pause();
         }
       }
diff --git a/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java b/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java
index 2674487..67eb8ce 100644
--- a/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java
+++ b/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java
@@ -112,12 +112,12 @@ public class SolrIndexWriter extends IndexWriter {
 //    return w;
 //  }
 
-  public static SolrIndexWriter buildIndexWriter(SolrCore core, String name, String path, DirectoryFactory directoryFactory, boolean create, IndexSchema schema, SolrIndexConfig config, IndexDeletionPolicy delPolicy, Codec codec) {
+  public static SolrIndexWriter buildIndexWriter(SolrCore core, String name, String path, DirectoryFactory directoryFactory, boolean create, IndexSchema schema, SolrIndexConfig config, IndexDeletionPolicy delPolicy, Codec codec, boolean commitOnClose) {
     SolrIndexWriter iw = null;
     Directory dir = null;
     try {
       dir = getDir(directoryFactory, path, config);
-      iw = new SolrIndexWriter(core, name, directoryFactory, dir, create, schema, config, delPolicy, codec);
+      iw = new SolrIndexWriter(core, name, directoryFactory, dir, create, schema, config, delPolicy, codec, commitOnClose);
     } catch (Throwable e) {
       ParWork.propagateInterrupt(e);
       SolrException exp = new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
@@ -170,11 +170,11 @@ public class SolrIndexWriter extends IndexWriter {
     assert ObjectReleaseTracker.track(this);
   }
 
-  public SolrIndexWriter(SolrCore core, String name, DirectoryFactory directoryFactory, Directory directory, boolean create, IndexSchema schema, SolrIndexConfig config, IndexDeletionPolicy delPolicy, Codec codec) throws IOException {
+  public SolrIndexWriter(SolrCore core, String name, DirectoryFactory directoryFactory, Directory directory, boolean create, IndexSchema schema, SolrIndexConfig config, IndexDeletionPolicy delPolicy, Codec codec, boolean commitOnClose) throws IOException {
     super(directory,
             config.toIndexWriterConfig(core).
                     setOpenMode(create ? IndexWriterConfig.OpenMode.CREATE : IndexWriterConfig.OpenMode.APPEND).
-                    setIndexDeletionPolicy(delPolicy).setCodec(codec)
+                    setIndexDeletionPolicy(delPolicy).setCodec(codec).setCommitOnClose(commitOnClose)
     );
     try {
     if (log.isDebugEnabled()) log.debug("Opened Writer " + name);
@@ -252,15 +252,11 @@ public class SolrIndexWriter extends IndexWriter {
   @SuppressForbidden(reason = "Need currentTimeMillis, commit time should be used only for debugging purposes, " +
           " but currently suspiciously used for replication as well")
   public static void setCommitData(IndexWriter iw, long commitCommandVersion) {
-    log.info("Calling setCommitData with IW:" + iw.toString() + " commitCommandVersion:"+commitCommandVersion);
+    if (log.isDebugEnabled()) log.debug("Calling setCommitData with IW:{} commitCommandVersion:{}", iw, commitCommandVersion);
     final Map<String,String> commitData = new HashMap<>();
     commitData.put(COMMIT_TIME_MSEC_KEY, String.valueOf(System.currentTimeMillis()));
     commitData.put(COMMIT_COMMAND_VERSION, String.valueOf(commitCommandVersion));
     iw.setLiveCommitData(commitData.entrySet());
-
-    if (log.isDebugEnabled()) {
-      log.debug("setCommitData(IndexWriter, long) - end");
-    }
   }
 
   // we override this method to collect metrics for merges.
@@ -346,18 +342,12 @@ public class SolrIndexWriter extends IndexWriter {
 
   @Override
   protected void doAfterFlush() throws IOException {
-    if (log.isDebugEnabled()) {
-      log.debug("doAfterFlush() - start");
-    }
+    if (log.isTraceEnabled()) log.trace("doAfterFlush() - start");
 
     if (flushMeter != null) { // this is null when writer is used only for snapshot cleanup
       flushMeter.mark();      // or if mergeTotals == false
     }
     super.doAfterFlush();
-
-    if (log.isDebugEnabled()) {
-      log.debug("doAfterFlush() - end");
-    }
   }
 
   @Override
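
buildIndexWriter now threads a commitOnClose flag through to Lucene's IndexWriterConfig.setCommitOnClose. A standalone illustration of what that flag controls, not Solr code (the directory and analyzer choices here are arbitrary):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.ByteBuffersDirectory;

    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer())
        .setCommitOnClose(false);   // close() will not commit pending changes
    try (IndexWriter iw = new IndexWriter(new ByteBuffersDirectory(), iwc)) {
      // ... add documents ...
      iw.commit();                  // commit points exist only where requested
    }

With commitOnClose=false, close() abandons uncommitted changes, so commit points are created only where the caller explicitly asks for them; SolrIndexSplitter passes true above to keep its commit-on-close behavior.
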
diff --git a/solr/core/src/java/org/apache/solr/update/UpdateLog.java b/solr/core/src/java/org/apache/solr/update/UpdateLog.java
index 0f51d41..8048060 100644
--- a/solr/core/src/java/org/apache/solr/update/UpdateLog.java
+++ b/solr/core/src/java/org/apache/solr/update/UpdateLog.java
@@ -190,25 +190,25 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
   protected volatile TransactionLog bufferTlog;
   protected volatile TransactionLog tlog;
 
-  protected TransactionLog prevTlog;
+  protected volatile TransactionLog prevTlog;
   protected TransactionLog prevTlogOnPrecommit;
   protected final Deque<TransactionLog> logs = new LinkedList<>();  // list of recent logs, newest first
   protected final LinkedList<TransactionLog> newestLogsOnStartup = new LinkedList<>();
-  protected int numOldRecords;  // number of records in the recent logs
+  protected volatile int numOldRecords;  // number of records in the recent logs
 
   protected volatile Map<BytesRef,LogPtr> map = new ConcurrentHashMap<>(32);
   protected volatile Map<BytesRef,LogPtr> prevMap;  // used while committing/reopening is happening
   protected volatile Map<BytesRef,LogPtr> prevMap2;  // used while committing/reopening is happening
-  protected TransactionLog prevMapLog;  // the transaction log used to look up entries found in prevMap
-  protected TransactionLog prevMapLog2;  // the transaction log used to look up entries found in prevMap2
+  protected volatile TransactionLog prevMapLog;  // the transaction log used to look up entries found in prevMap
+  protected volatile TransactionLog prevMapLog2;  // the transaction log used to look up entries found in prevMap2
 
   protected final int numDeletesToKeep = 1000;
   protected final int numDeletesByQueryToKeep = 100;
-  protected int numRecordsToKeep;
+  protected volatile int numRecordsToKeep;
   protected volatile int maxNumLogsToKeep;
   protected volatile int numVersionBuckets = 65536; // This should only be used to initialize VersionInfo... the actual number of buckets may be rounded up to a power of two.
-  protected Long maxVersionFromIndex = null;
-  protected boolean existOldBufferLog = false;
+  protected volatile Long maxVersionFromIndex = null;
+  protected volatile boolean existOldBufferLog = false;
 
   // keep track of deletes only... this is not updated on an add
   protected Map<BytesRef, LogPtr> oldDeletes = Collections.synchronizedMap(new LinkedHashMap<BytesRef, LogPtr>(numDeletesToKeep) {
@@ -435,7 +435,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
 
       // TODO: these startingVersions assume that we successfully recover from all non-complete tlogs.
       try (RecentUpdates startingUpdates = getRecentUpdates()) {
-        startingVersions = startingUpdates.getVersions(numRecordsToKeep);
+        startingVersions = Collections.unmodifiableList(startingUpdates.getVersions(numRecordsToKeep));
 
         // populate recent deletes list (since we can't get that info from the index)
         for (int i = startingUpdates.deleteList.size() - 1; i >= 0; i--) {
@@ -1142,22 +1142,18 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
   // synchronization is needed for stronger guarantees (as VersionUpdateProcessor does).
   public Long lookupVersion(BytesRef indexedId) {
     LogPtr entry;
-    tlogLock.lock();
-    try {
-      entry = map.get(indexedId);
-      // SolrCore.verbose("TLOG: lookup ver: for id ",indexedId.utf8ToString(),"in map",System.identityHashCode(map),"got",entry,"lookupLog=",lookupLog);
-      if (entry == null && prevMap != null) {
-        entry = prevMap.get(indexedId);
-        // something found in prevMap will always be found in prevMapLog (which could be tlog or prevTlog)
-        // SolrCore.verbose("TLOG: lookup ver: for id ",indexedId.utf8ToString(),"in prevMap",System.identityHashCode(map),"got",entry,"lookupLog=",lookupLog);
-      }
-      if (entry == null && prevMap2 != null) {
-        entry = prevMap2.get(indexedId);
-        // something found in prevMap2 will always be found in prevMapLog2 (which could be tlog or prevTlog)
-        // SolrCore.verbose("TLOG: lookup ver: for id ",indexedId.utf8ToString(),"in prevMap2",System.identityHashCode(map),"got",entry,"lookupLog=",lookupLog);
-      }
-    } finally {
-      tlogLock.unlock();
+
+    entry = map.get(indexedId);
+    // SolrCore.verbose("TLOG: lookup ver: for id ",indexedId.utf8ToString(),"in map",System.identityHashCode(map),"got",entry,"lookupLog=",lookupLog);
+    if (entry == null && prevMap != null) {
+      entry = prevMap.get(indexedId);
+      // something found in prevMap will always be found in prevMapLog (which could be tlog or prevTlog)
+      // SolrCore.verbose("TLOG: lookup ver: for id ",indexedId.utf8ToString(),"in prevMap",System.identityHashCode(map),"got",entry,"lookupLog=",lookupLog);
+    }
+    if (entry == null && prevMap2 != null) {
+      entry = prevMap2.get(indexedId);
+      // something found in prevMap2 will always be found in prevMapLog2 (which could be tlog or prevTlog)
+      // SolrCore.verbose("TLOG: lookup ver: for id ",indexedId.utf8ToString(),"in prevMap2",System.identityHashCode(map),"got",entry,"lookupLog=",lookupLog);
     }
 
 
@@ -1174,12 +1170,8 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
 
     // We can't get any version info for deletes from the index, so if the doc
     // wasn't found, check a cache of recent deletes.
-    tlogLock.lock();
-    try {
-      entry = oldDeletes.get(indexedId);
-    } finally {
-      tlogLock.unlock();
-    }
+
+    entry = oldDeletes.get(indexedId);
 
     if (entry != null) {
       return entry.version;
@@ -1195,15 +1187,10 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
     if (syncLevel == SyncLevel.NONE) {
       return;
     }
-    TransactionLog currLog;
-    tlogLock.lock();
-    try {
-      currLog = tlog;
-      if (currLog == null) return;
-      currLog.incref();
-    } finally {
-      tlogLock.unlock();
-    }
+
+    TransactionLog currLog = tlog;
+    if (currLog == null) return;
+    currLog.incref();
 
     try {
       currLog.finish(syncLevel);
@@ -1327,20 +1314,16 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
   public void commitAndSwitchToNewTlog(CommitUpdateCommand cuc) {
     versionInfo.blockUpdates();
     try {
-      tlogLock.lock();
+      if (tlog == null) {
+        return;
+      }
+      preCommit(cuc);
       try {
-        if (tlog == null) {
-          return;
-        }
-        preCommit(cuc);
-        try {
-          copyOverOldUpdates(cuc.getVersion());
-        } finally {
-          postCommit(cuc);
-        }
+        copyOverOldUpdates(cuc.getVersion());
       } finally {
-        tlogLock.unlock();
+        postCommit(cuc);
       }
+
     } finally {
       versionInfo.unblockUpdates();
     }
@@ -1611,19 +1594,31 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
       for (List<Update> singleList : updateList) {
         for (Update ptr : singleList) {
           if(Math.abs(ptr.version) > Math.abs(maxVersion)) continue;
-          ret.add(ptr.version);
+          if (ptr.version != 0) {
+            ret.add(ptr.version);
+          } else {
+            log.warn("Found version of 0 {} {} {}", ptr.pointer, ptr.previousVersion, ptr.log);
+          }
           if (--n <= 0) return ret;
         }
       }
+      if (log.isDebugEnabled()) log.debug("Return getVersions {} {}", n, ret);
 
       return ret;
     }
 
     public Object lookup(long version) {
+      if (log.isDebugEnabled()) log.debug("lookup {}", version);
       Update update = updates.get(version);
       if (update == null) return null;
 
-      return update.log.lookup(update.pointer);
+      if (log.isDebugEnabled()) log.debug("found update from updates {} {}", update.version, updates.size());
+
+      Object object = update.log.lookup(update.pointer);
+
+      if (log.isDebugEnabled()) log.debug("found update from log {} {} ptr={} object={}", update.version, update.log, update.pointer, object);
+
+      return object;
     }
 
     /** Returns the list of deleteByQueries that happened after the given version */
@@ -1640,7 +1635,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
 
     private void update() {
       int numUpdates = 0;
-      updateList = new ArrayList<>(logList.size());
+      updateList = new ArrayList<>(numRecordsToKeep);
       deleteByQueryList = new ArrayList<>();
       deleteList = new ArrayList<>();
       updates = new HashMap<>(numRecordsToKeep);
@@ -1705,12 +1700,14 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
               // would be caused by a corrupt transaction log
             } catch (Exception ex) {
               log.warn("Exception reverse reading log", ex);
-              break;
+             // break;
             }
 
             numUpdates++;
           }
 
+          if (log.isDebugEnabled()) log.debug("Recent updates numUpdates={} numRecordsToKeep={}", numUpdates, numRecordsToKeep);
+
         } catch (IOException | AssertionError e) { // catch AssertionError to handle certain test failures correctly
           // failure to read a log record isn't fatal
           log.error("Exception reading versions from log",e);
@@ -1744,6 +1741,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
   public RecentUpdates getRecentUpdates() {
     Deque<TransactionLog> logList;
     tlogLock.lock();
+    RecentUpdates recentUpdates;
     try {
       logList = new LinkedList<>(logs);
       for (TransactionLog log : logList) {
@@ -1761,14 +1759,15 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
         bufferTlog.incref();
         logList.addFirst(bufferTlog);
       }
+
+      recentUpdates = new RecentUpdates(logList, numRecordsToKeep);
     } finally {
       tlogLock.unlock();
     }
 
     // TODO: what if I hand out a list of updates, then do an update, then hand out another list (and
     // one of the updates I originally handed out fell off the list).  Over-request?
-    return new RecentUpdates(logList, numRecordsToKeep);
-
+    return recentUpdates;
   }
 
   public void bufferUpdates() {
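
The UpdateLog hunks above drop tlogLock from the read paths and rely on volatile publication instead: the generation maps are ConcurrentHashMaps behind volatile fields, and getRecentUpdates now builds the RecentUpdates object while still holding the lock so the log-list snapshot and its increfs stay consistent. A reduced model of the lock-free read idiom (illustrative types, not UpdateLog itself):

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    class VersionMaps {
      // each generation is a ConcurrentHashMap behind a volatile field, so a
      // reader sees a fully published map and individual gets need no lock
      volatile Map<String, Long> map = new ConcurrentHashMap<>();
      volatile Map<String, Long> prevMap;   // older generation, may be null

      Long lookup(String id) {
        Long v = map.get(id);               // volatile read, then CHM get
        if (v == null && prevMap != null) {
          v = prevMap.get(id);              // fall back to the older generation
        }
        return v;
      }
    }
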
diff --git a/solr/core/src/java/org/apache/solr/update/processor/AddSchemaFieldsUpdateProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/AddSchemaFieldsUpdateProcessorFactory.java
index 3a26571..888926f 100644
--- a/solr/core/src/java/org/apache/solr/update/processor/AddSchemaFieldsUpdateProcessorFactory.java
+++ b/solr/core/src/java/org/apache/solr/update/processor/AddSchemaFieldsUpdateProcessorFactory.java
@@ -478,7 +478,7 @@ public class AddSchemaFieldsUpdateProcessorFactory extends UpdateRequestProcesso
 
           newSchema = oldSchema.addFields(newFields, Collections.emptyMap(), false);
 
-          log.info("Old schema version for request is {} version for latest on core is {} new schema version={}", ((ManagedIndexSchema) oldSchema).getSchemaZkVersion(), ((ManagedIndexSchema) core.getLatestSchema()).getSchemaZkVersion(), ((ManagedIndexSchema) newSchema).getSchemaZkVersion());
+          if (log.isDebugEnabled()) log.debug("Old schema version for request is {} version for latest on core is {} new schema version={}", ((ManagedIndexSchema) oldSchema).getSchemaZkVersion(), ((ManagedIndexSchema) core.getLatestSchema()).getSchemaZkVersion(), ((ManagedIndexSchema) newSchema).getSchemaZkVersion());
 
           // Add copyFields
           for (Map.Entry<String,Map<Integer,List<CopyFieldDef>>> entry : newCopyFields.entrySet()) {
@@ -512,7 +512,7 @@ public class AddSchemaFieldsUpdateProcessorFactory extends UpdateRequestProcesso
             ((ManagedIndexSchema) cmd.getReq().getSchema()).getManagedIndexSchemaFactory().getZkIndexSchemaReader().updateSchema();
             cmd.getReq().updateSchemaToLatest();
 
-            log.info("Schema changed while processing request ... current latest version {} try={}", ((ManagedIndexSchema) cmd.getReq().getSchema()).getSchemaZkVersion(), cnt);
+            if (log.isDebugEnabled()) log.debug("Schema changed while processing request ... current latest version {} try={}", ((ManagedIndexSchema) cmd.getReq().getSchema()).getSchemaZkVersion(), cnt);
           } catch (KeeperException.SessionExpiredException keeperException) {
             throw new SolrException(SERVER_ERROR, keeperException);
           } catch (Exception e1) {
diff --git a/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java
index 08278d2..7c9628c 100644
--- a/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java
+++ b/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java
@@ -20,6 +20,7 @@ import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.util.List;
 import java.util.Set;
+import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
@@ -258,7 +259,7 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
 
         if (!doDist) {
           // TODO: possibly set checkDeleteByQueries as a flag on the command?
-          if (log.isDebugEnabled()) log.debug("Local add cmd {}", cmd.solrDoc);
+          if (log.isTraceEnabled()) log.trace("Local add cmd {}", cmd.solrDoc);
           doLocalAdd(cmd);
 
           // if the update updates a doc that is part of a nested structure,
@@ -292,42 +293,56 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
       } else {
         finalCloneCmd = cmd;
       }
-      distFuture = ParWork.getRootSharedExecutor().submit(() -> {
+
+      Callable<Void> distCall = () -> {
         if (log.isTraceEnabled()) log.trace("Run distrib add collection");
 
         try {
           doDistribAdd(finalCloneCmd);
           if (log.isTraceEnabled()) log.trace("after distrib add collection");
         } catch (Throwable e) {
-          ParWork.propagateInterrupt(e);
           throw new SolrException(ErrorCode.SERVER_ERROR, e);
         }
-      });
+        return null;
+      };
+
+      if (!forwardToLeader) {
+        distFuture = ParWork.getRootSharedExecutor().submit(distCall);
+      } else {
+        try {
+          distCall.call();
+        } catch (Exception e) {
+          throw new SolrException(ErrorCode.SERVER_ERROR, e);
+        }
+      }
 
       // TODO: possibly set checkDeleteByQueries as a flag on the command?
       // if the update updates a doc that is part of a nested structure,
       // force open a realTimeSearcher to trigger a ulog cache refresh.
       // This refresh makes RTG handler aware of this update.q
 
-      // TODO: possibly set checkDeleteByQueries as a flag on the command?
-      if (log.isDebugEnabled()) log.debug("Local add cmd {}", cmd.solrDoc);
-      try {
-        doLocalAdd(cmd);
-      } catch (Exception e) {
-        if (distFuture != null) {
-          distFuture.cancel(true);
-        }
-        if (e instanceof RuntimeException) {
-          throw (RuntimeException) e;
+
+      if (!forwardToLeader) {
+        // TODO: possibly set checkDeleteByQueries as a flag on the command?
+        if (log.isTraceEnabled()) log.trace("Local add cmd {}", cmd.solrDoc);
+        try {
+          doLocalAdd(cmd);
+        } catch (Exception e) {
+          if (distFuture != null) {
+            distFuture.cancel(false);
+          }
+          if (e instanceof RuntimeException) {
+            throw (RuntimeException) e;
+          }
+          throw new SolrException(ErrorCode.SERVER_ERROR, e);
         }
-        throw new SolrException(ErrorCode.SERVER_ERROR, e);
-      }
-      // if the update updates a doc that is part of a nested structure,
-      // force open a realTimeSearcher to trigger a ulog cache refresh.
-      // This refresh makes RTG handler aware of this update.q
-      if (ulog != null) {
-        if (req.getSchema().isUsableForChildDocs() && shouldRefreshUlogCaches(cmd)) {
-          ulog.openRealtimeSearcher();
+        // if the update updates a doc that is part of a nested structure,
+        // force open a realTimeSearcher to trigger a ulog cache refresh.
+        // This refresh makes RTG handler aware of this update.
+        if (ulog != null) {
+          if (req.getSchema().isUsableForChildDocs() && shouldRefreshUlogCaches(cmd)) {
+            ulog.openRealtimeSearcher();
+          }
         }
       }
 
@@ -945,7 +960,7 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
           t = e;
         }
         if (distFuture != null) {
-          distFuture.cancel(true);
+          distFuture.cancel(false);
         }
         if (t instanceof SolrException) {
           throw (SolrException) t;
diff --git a/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java
index a5a8847..8b5d07b 100644
--- a/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java
+++ b/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java
@@ -190,7 +190,7 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
             EnumSet.of(Replica.Type.TLOG, Replica.Type.NRT), true);
 
         try {
-          leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, cloudDesc.getShardId(), 1000, false);
+          leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, cloudDesc.getShardId(), 3000, false);
         } catch (Exception e) {
           ParWork.propagateInterrupt(e);
           throw new SolrException(ErrorCode.SERVER_ERROR,
@@ -563,7 +563,7 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
             collection, myShardId);
         // DBQ forwarded to NRT and TLOG replicas
         List<Replica> replicaProps = zkController.getZkStateReader()
-            .getReplicaProps(collection, myShardId, leaderReplica.getName(), null, Replica.State.DOWN, EnumSet.of(Replica.Type.NRT, Replica.Type.TLOG));
+            .getReplicaProps(collection, myShardId, leaderReplica.getName(), Replica.State.BUFFERING, Replica.State.ACTIVE, EnumSet.of(Replica.Type.NRT, Replica.Type.TLOG));
         if (replicaProps != null) {
           final List<SolrCmdDistributor.Node> myReplicas = new ArrayList<>(replicaProps.size());
           for (Replica replicaProp : replicaProps) {
@@ -611,7 +611,7 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
 
       forwardToLeader = false;
       List<Replica> replicaProps = zkController.getZkStateReader()
-          .getReplicaProps(collection, shardId, leaderReplica.getName(), null, Replica.State.DOWN, EnumSet.of(Replica.Type.NRT, Replica.Type.TLOG));
+          .getReplicaProps(collection, shardId, leaderReplica.getName(), Replica.State.BUFFERING, Replica.State.ACTIVE, EnumSet.of(Replica.Type.NRT, Replica.Type.TLOG));
       if (replicaProps != null) {
         nodes = new ArrayList<>(replicaProps.size());
         for (Replica props : replicaProps) {
@@ -645,7 +645,7 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
           + "failed since we're not in cloud mode.");
     }
     try {
-      return zkController.getZkStateReader().getLeaderRetry(collection, cloudDesc.getShardId(), 1500, false).getCoreUrl();
+      return zkController.getZkStateReader().getLeaderRetry(collection, cloudDesc.getShardId(), 3000, false).getCoreUrl();
     } catch (InterruptedException | TimeoutException e) {
       ParWork.propagateInterrupt(e);
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Exception during fetching from leader.", e);
@@ -717,14 +717,14 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
     try {
       // Not equivalent to getLeaderProps, which  retries to find a leader.
       // Replica leader = slice.getLeader();
-      Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, shardId, 100, false);
+      Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, shardId, 3000, false);
       isLeader = leaderReplica.getName().equals(desc.getName());
       if (log.isTraceEnabled()) log.trace("Are we leader for sending to replicas? {} phase={}", isLeader, phase);
       if (!isLeader) {
         isSubShardLeader = amISubShardLeader(coll, slice, id, doc);
         if (isSubShardLeader) {
           shardId = cloudDesc.getShardId();
-          leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, shardId, 1500, false);
+          leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, shardId, 3000, false);
         }
       }
 
@@ -779,7 +779,8 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
               log.debug("skip url:{} cause its term is less than leader", replica.getCoreUrl());
             }
             skippedCoreNodeNames.add(replica.getName());
-          } else if (!zkController.getZkStateReader().getLiveNodes().contains(replica.getNodeName()) || replica.getState() == Replica.State.DOWN) {
+          } else if (!zkController.getZkStateReader().getLiveNodes().contains(replica.getNodeName()) || (replica.getState() != Replica.State.ACTIVE &&
+              replica.getState() != Replica.State.BUFFERING)) {
             skippedCoreNodeNames.add(replica.getName());
           } else {
             nodes.add(new SolrCmdDistributor.StdNode(zkController.getZkStateReader(), replica, collection, shardId, maxRetriesToFollowers));
diff --git a/solr/core/src/java/org/apache/solr/util/plugin/AbstractPluginLoader.java b/solr/core/src/java/org/apache/solr/util/plugin/AbstractPluginLoader.java
index 0e0946d..be53737 100644
--- a/solr/core/src/java/org/apache/solr/util/plugin/AbstractPluginLoader.java
+++ b/solr/core/src/java/org/apache/solr/util/plugin/AbstractPluginLoader.java
@@ -162,10 +162,9 @@ public abstract class AbstractPluginLoader<T>
           }
 
           T plugin = create(loader, name, className, node, xpath);
-          if (log.isDebugEnabled()) {
-            log.debug("created {}: {}", ((name != null) ? name : ""), plugin.getClass().getName());
-          }
-          
+
+          if (log.isTraceEnabled()) log.trace("created {}: {}", ((name != null) ? name : ""), plugin.getClass().getName());
+
           // Either initialize now or wait till everything has been registered
           if( preRegister ) {
             info.add( new PluginInitInfo( plugin, node ) );
diff --git a/solr/core/src/test-files/log4j2.xml b/solr/core/src/test-files/log4j2.xml
index d4ae85d..bd8b844 100644
--- a/solr/core/src/test-files/log4j2.xml
+++ b/solr/core/src/test-files/log4j2.xml
@@ -21,7 +21,7 @@
     <Console name="STDERR" target="SYSTEM_ERR">
       <PatternLayout>
         <Pattern>
-          %maxLen{%-4r %-5p (%t) [%X{node_name} %X{collection} %X{shard} %X{replica} %X{core} %X{trace_id}] %c{1.} %m%notEmpty{
+          %maxLen{%-4r %-5p (%-5t) [%X{node_name} %X{core}] %c{1.} %m%notEmpty{
           =>%ex{short}}}{10240}%n
         </Pattern>
       </PatternLayout>
diff --git a/solr/core/src/test/org/apache/solr/CursorPagingTest.java b/solr/core/src/test/org/apache/solr/CursorPagingTest.java
index 498b3c7..76967fa 100644
--- a/solr/core/src/test/org/apache/solr/CursorPagingTest.java
+++ b/solr/core/src/test/org/apache/solr/CursorPagingTest.java
@@ -49,7 +49,6 @@ import java.util.UUID;
 /**
  * Tests of deep paging using {@link CursorMark} and {@link CursorMarkParams#CURSOR_MARK_PARAM}.
  */
-// TODO bad seed? DCC82A1EDB76AEC 9637DF7A121FD190
 public class CursorPagingTest extends SolrTestCaseJ4 {
 
   /** solrconfig.xml file name, shared with other cursor related tests */
@@ -63,9 +62,14 @@ public class CursorPagingTest extends SolrTestCaseJ4 {
   @BeforeClass
   public static void beforeTests() throws Exception {
     // we need DVs on point fields to compute stats & facets
+    System.setProperty(NUMERIC_POINTS_SYSPROP, "true");
+    randomizeNumericTypesProperties();
+    System.setProperty(NUMERIC_DOCVALUES_SYSPROP, "true");
+
     System.setProperty("solr.test.useFilterForSortedQuery", Boolean.toString(random().nextBoolean()));
     initCore(TEST_SOLRCONFIG_NAME, TEST_SCHEMAXML_NAME);
   }
+
   @After
   public void cleanup() throws Exception {
     assertU(delQ("*:*"));
diff --git a/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java b/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java
index 9864470..604abec 100644
--- a/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java
@@ -23,7 +23,6 @@ import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.EnumSet;
 import java.util.List;
-import java.util.Set;
 import java.util.concurrent.Future;
 import java.util.concurrent.Semaphore;
 import java.util.concurrent.TimeUnit;
@@ -63,6 +62,7 @@ import org.slf4j.LoggerFactory;
 import static org.apache.solr.common.cloud.Replica.State.DOWN;
 
 // TODO: this is flakey, can rarely leak a Directory
+// The UnloadCoreOnDeletedWatcher has been removed
 @SolrTestCase.SuppressObjectReleaseTracker(object = "NRTCachingDirectory")
 public class DeleteReplicaTest extends SolrCloudTestCase {
 
@@ -127,8 +127,8 @@ public class DeleteReplicaTest extends SolrCloudTestCase {
     JettySolrRunner replicaJetty = cluster.getReplicaJetty(replica);
     ZkStateReaderAccessor accessor = new ZkStateReaderAccessor(replicaJetty.getCoreContainer().getZkController().getZkStateReader());
 
-    final long preDeleteWatcherCount = countUnloadCoreOnDeletedWatchers
-      (accessor.getStateWatchers(collectionName));
+//    final long preDeleteWatcherCount = countUnloadCoreOnDeletedWatchers
+//      (accessor.getStateWatchers(collectionName));
     
     CollectionAdminRequest.deleteReplica(collectionName, shard.getName(), replica.getName())
         .process(cluster.getSolrClient());
@@ -224,9 +224,9 @@ public class DeleteReplicaTest extends SolrCloudTestCase {
     
     JettySolrRunner replicaJetty = cluster.getReplicaJetty(replica);
     ZkStateReaderAccessor accessor = new ZkStateReaderAccessor(replicaJetty.getCoreContainer().getZkController().getZkStateReader());
-
-    final long preDeleteWatcherCount = countUnloadCoreOnDeletedWatchers
-      (accessor.getStateWatchers(collectionName));
+//
+//    final long preDeleteWatcherCount = countUnloadCoreOnDeletedWatchers
+//      (accessor.getStateWatchers(collectionName));
 
     ZkNodeProps m = new ZkNodeProps(
         Overseer.QUEUE_OPERATION, OverseerAction.DELETECORE.toLower(),
@@ -245,14 +245,14 @@ public class DeleteReplicaTest extends SolrCloudTestCase {
     );
     
     // the core should no longer have a watch collection state since it was removed
-    timeOut = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME);
-    timeOut.waitFor("Waiting for core's watcher to be removed", () -> {
-        final long postDeleteWatcherCount = countUnloadCoreOnDeletedWatchers
-          (accessor.getStateWatchers(collectionName));
-        log.info("preDeleteWatcherCount={} vs postDeleteWatcherCount={}",
-                 preDeleteWatcherCount, postDeleteWatcherCount);
-        return (preDeleteWatcherCount - 1L == postDeleteWatcherCount);
-      });
+//    timeOut = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+//    timeOut.waitFor("Waiting for core's watcher to be removed", () -> {
+//        final long postDeleteWatcherCount = countUnloadCoreOnDeletedWatchers
+//          (accessor.getStateWatchers(collectionName));
+//        log.info("preDeleteWatcherCount={} vs postDeleteWatcherCount={}",
+//                 preDeleteWatcherCount, postDeleteWatcherCount);
+//        return (preDeleteWatcherCount - 1L == postDeleteWatcherCount);
+//      });
     
     CollectionAdminRequest.deleteCollection(collectionName).process(cluster.getSolrClient());
   }
@@ -424,19 +424,5 @@ public class DeleteReplicaTest extends SolrCloudTestCase {
       throw e;
     }
   }
-
-  /** 
-   * Helper method for counting the number of instances of <code>UnloadCoreOnDeletedWatcher</code>
-   * that exist on a given node.
-   *
-   * This is useful for verifying that deleting a replica correctly removed it's watchers.
-   *
-   * (Note: tests should not assert specific values, since multiple replicas may exist on the same 
-   * node. Instead tests should only assert that the number of watchers has decreased by 1 per known 
-   * replica removed)
-   */
-  private static final long countUnloadCoreOnDeletedWatchers(final Set<DocCollectionWatcher> watchers) {
-    return watchers.stream().filter(w -> w instanceof ZkController.UnloadCoreOnDeletedWatcher).count();
-  }
 }
 
diff --git a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java
index abd6edd..67bf7f3 100644
--- a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java
@@ -39,6 +39,7 @@ import org.apache.solr.common.cloud.ZkCoreNodeProps;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.util.SolrNamedThreadFactory;
+import org.apache.solr.core.CoreDescriptor;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.KeeperException.NoNodeException;
 import org.apache.zookeeper.KeeperException.SessionExpiredException;
@@ -95,9 +96,9 @@ public class LeaderElectionTest extends SolrTestCaseJ4 {
 
     public TestLeaderElectionContext(LeaderElector leaderElector,
         String shardId, String collection, String coreNodeName, Replica props,
-        ZkController zkController, long runLeaderDelay) {
+        ZkController zkController, long runLeaderDelay, CoreDescriptor cd) {
       super (coreNodeName, "/collections/" + collection,
-              "/collections/" + collection + "/leader", props, zkController.getZkClient());
+              "/collections/" + collection + "/leader", props, cd, zkController.getZkClient());
       this.runLeaderDelay = runLeaderDelay;
     }
 
@@ -182,7 +183,7 @@ public class LeaderElectionTest extends SolrTestCaseJ4 {
     private void setupOnConnect() throws InterruptedException, KeeperException,
         IOException {
       assertNotNull(es);
-      TestLeaderElectionContext context = new TestLeaderElectionContext(es.elector, shard, "collection1", nodeName, replica, es.zkController, runLeaderDelay);
+      TestLeaderElectionContext context = new TestLeaderElectionContext(es.elector, shard, "collection1", nodeName, replica, es.zkController, runLeaderDelay, null);
       es.elector.setup(context);
       // nocommit - we have to get the seq another way, now returns if become leader first try
       //seq = es.elector.joinElection(context, false);
diff --git a/solr/core/src/test/org/apache/solr/cloud/MissingSegmentRecoveryTest.java b/solr/core/src/test/org/apache/solr/cloud/MissingSegmentRecoveryTest.java
index c0907f2..addc405 100644
--- a/solr/core/src/test/org/apache/solr/cloud/MissingSegmentRecoveryTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/MissingSegmentRecoveryTest.java
@@ -38,11 +38,13 @@ import org.junit.After;
 import org.junit.AfterClass;
 import org.junit.Before;
 import org.junit.BeforeClass;
+import org.junit.Ignore;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 @Slow
+@Ignore // nocommit - this feature needs a little work
 public class MissingSegmentRecoveryTest extends SolrCloudTestCase {
 
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
diff --git a/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java b/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
index 436b3b8..d753695 100644
--- a/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
@@ -231,7 +231,7 @@ public class OverseerTest extends SolrTestCaseJ4 {
                   "overseer"));
           Replica replica = new Replica(coreName, props.getProperties(), collection, shardId, zkStateReader);
           ShardLeaderElectionContextBase ctx = new ShardLeaderElectionContextBase(
-              nodeName + "_" + coreName, shardId, collection, replica,
+              nodeName + "_" + coreName, shardId, collection, replica, null,
               zkStateReader.getZkClient());
           elector.setup(ctx);
           electionContext.put(coreName, ctx);
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestDistributedMap.java b/solr/core/src/test/org/apache/solr/cloud/TestDistributedMap.java
index ccd9f47..eef9c1f 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestDistributedMap.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestDistributedMap.java
@@ -59,7 +59,7 @@ public class TestDistributedMap extends SolrTestCaseJ4 {
     String path = getAndMakeInitialPath(zkClient);
     DistributedMap map = createMap(zkClient, path);
     assertFalse(zkClient.exists(path + "/" + DistributedMap.PREFIX + "foo"));
-    map.put("foo", new byte[0]);
+    map.put("foo", new byte[0], CreateMode.PERSISTENT);
     assertTrue(zkClient.exists(path + "/" + DistributedMap.PREFIX + "foo"));
   }
 
@@ -106,9 +106,9 @@ public class TestDistributedMap extends SolrTestCaseJ4 {
     assertEquals(0, map.size());
     map.remove("bar");
     assertEquals(0, map.size());
-    map.put("foo", new byte[0]);
+    map.put("foo", new byte[0], CreateMode.PERSISTENT);
     assertEquals(1, map.size());
-    map.put("foo2", new byte[0]);
+    map.put("foo2", new byte[0], CreateMode.PERSISTENT);
     assertEquals(2, map.size());
     map.remove("foo");
     assertEquals(1, map.size());
@@ -143,11 +143,11 @@ public class TestDistributedMap extends SolrTestCaseJ4 {
     String path = getAndMakeInitialPath(zkClient);
     DistributedMap map = createMap(zkClient, path);
     assertEquals(0, map.keys().size());
-    map.put("foo", new byte[0]);
+    map.put("foo", new byte[0], CreateMode.PERSISTENT);
     assertTrue(map.keys().contains("foo"));
     assertEquals(1, map.keys().size());
 
-    map.put("bar", new byte[0]);
+    map.put("bar", new byte[0], CreateMode.PERSISTENT);
     assertTrue(map.keys().contains("bar"));
     assertTrue(map.keys().contains("foo"));
     assertEquals(2, map.keys().size());
@@ -164,8 +164,8 @@ public class TestDistributedMap extends SolrTestCaseJ4 {
     DistributedMap map = createMap(zkClient, path);
     map.clear();
     assertEquals(0, map.size());
-    map.put("foo", new byte[0]);
-    map.put("bar", new byte[0]);
+    map.put("foo", new byte[0], CreateMode.PERSISTENT);
+    map.put("bar", new byte[0], CreateMode.PERSISTENT);
     assertEquals(2, map.size());
     map.clear();
     assertEquals(0, map.size());
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestSizeLimitedDistributedMap.java b/solr/core/src/test/org/apache/solr/cloud/TestSizeLimitedDistributedMap.java
index a210807..c491fb0 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestSizeLimitedDistributedMap.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestSizeLimitedDistributedMap.java
@@ -22,6 +22,7 @@ import java.util.LinkedList;
 import java.util.List;
 import java.util.Set;
 import org.apache.solr.common.cloud.SolrZkClient;
+import org.apache.zookeeper.CreateMode;
 import org.apache.zookeeper.KeeperException;
 import org.junit.BeforeClass;
 
@@ -41,7 +42,7 @@ public class TestSizeLimitedDistributedMap extends TestDistributedMap {
     String path = getAndMakeInitialPath(zkClient);
     DistributedMap map = new SizeLimitedDistributedMap(zkClient, path, numResponsesToStore, (element) -> deletedItems.add(element));
     for (int i = 0; i < numResponsesToStore; i++) {
-      map.put("xyz_" + i, new byte[0]);
+      map.put("xyz_" + i, new byte[0], CreateMode.PERSISTENT);
       expectedKeys.add("xyz_" + i);
     }
 
@@ -49,7 +50,7 @@ public class TestSizeLimitedDistributedMap extends TestDistributedMap {
     assertTrue("Expected keys do not match", expectedKeys.containsAll(map.keys()));
     assertTrue("Expected keys do not match", map.keys().containsAll(expectedKeys));
     // add another to trigger cleanup
-    map.put("xyz_" + numResponsesToStore, new byte[0]);
+    map.put("xyz_" + numResponsesToStore, new byte[0], CreateMode.PERSISTENT);
     expectedKeys.add("xyz_" + numResponsesToStore);
     assertEquals("Distributed queue was not cleaned up",
             numResponsesToStore - (numResponsesToStore / 10) + 1, map.size());
diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIDistributedZkTest.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIDistributedZkTest.java
index 4e7b03d..b044358 100644
--- a/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIDistributedZkTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIDistributedZkTest.java
@@ -289,6 +289,7 @@ public class CollectionsAPIDistributedZkTest extends SolrCloudTestCase {
   }
 
   @Test
+  @Ignore
   public void testDeleteNonExistentCollection() throws Exception {
 
     expectThrows(Exception.class, () -> {
diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/CreateCollectionsIndexAndRestartTest.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/CreateCollectionsIndexAndRestartTest.java
index 62bd7b5..c689514 100644
--- a/solr/core/src/test/org/apache/solr/cloud/api/collections/CreateCollectionsIndexAndRestartTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/CreateCollectionsIndexAndRestartTest.java
@@ -16,7 +16,6 @@
  */
 package org.apache.solr.cloud.api.collections;
 
-import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.LuceneTestCase.Slow;
 import org.apache.solr.client.solrj.embedded.JettySolrRunner;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
@@ -36,7 +35,7 @@ import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 
 @Slow
-@LuceneTestCase.AwaitsFix(bugUrl = "This an experimental test class")
+//@LuceneTestCase.AwaitsFix(bugUrl = "This an experimental test class")
 public class CreateCollectionsIndexAndRestartTest extends SolrCloudTestCase {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
diff --git a/solr/server/etc/jetty-https.xml b/solr/server/etc/jetty-https.xml
index 331cb3d..c12b4f6 100644
--- a/solr/server/etc/jetty-https.xml
+++ b/solr/server/etc/jetty-https.xml
@@ -79,7 +79,7 @@
         <Set name="acceptQueueSize"><Property name="solr.jetty.https.acceptQueueSize" default="4096"/></Set>
         <Call name="addLifeCycleListener">
           <Arg>
-            <New class="org.apache.solr.servlet.SolrConnectorListener"/>
+            <New class="org.apache.solr.servlet.SolrLifcycleListener"/>
           </Arg>
         </Call>
       </New>
diff --git a/solr/server/etc/jetty-https8.xml b/solr/server/etc/jetty-https8.xml
index f937852..9116a36 100644
--- a/solr/server/etc/jetty-https8.xml
+++ b/solr/server/etc/jetty-https8.xml
@@ -62,6 +62,11 @@
         <Set name="idleTimeout"><Property name="solr.jetty.https.timeout" default="120000"/></Set>
         <Set name="acceptorPriorityDelta"><Property name="solr.jetty.ssl.acceptorPriorityDelta" default="0"/></Set>
         <Set name="acceptQueueSize"><Property name="solr.jetty.https.acceptQueueSize" default="0"/></Set>
+        <Call name="addLifeCycleListener">
+          <Arg>
+            <New class="org.apache.solr.servlet.SolrLifcycleListener"/>
+          </Arg>
+        </Call>
       </New>
     </Arg>
   </Call>
diff --git a/solr/server/etc/jetty.xml b/solr/server/etc/jetty.xml
index 438eb36..bfbf422 100644
--- a/solr/server/etc/jetty.xml
+++ b/solr/server/etc/jetty.xml
@@ -152,10 +152,6 @@
            <Set name="handlers">
              <Array type="org.eclipse.jetty.server.Handler">
                <Item>
-                 <New id="ShutdownHandler" class="org.apache.solr.servlet.SolrShutdownHandler">
-                 </New>
-               </Item>
-               <Item>
                  <New class="org.eclipse.jetty.server.handler.InetAccessHandler">
                    <Call name="include">
                      <Arg>
@@ -193,9 +189,9 @@
     <!-- extra options                                               -->
     <!-- =========================================================== -->
     <Set name="stopAtShutdown">true</Set>
-    <Set name="stopTimeout">500</Set>
+    <Set name="stopTimeout">5000</Set>
     <Set name="dumpAfterStart">false</Set>
-    <Set name="dumpBeforeStop">true</Set>
+    <Set name="dumpBeforeStop">false</Set>
 
     <Call name="addBean">
       <Arg>
diff --git a/solr/server/resources/log4j2.xml b/solr/server/resources/log4j2.xml
index 6ac550a..411de14 100644
--- a/solr/server/resources/log4j2.xml
+++ b/solr/server/resources/log4j2.xml
@@ -23,7 +23,7 @@
     <Console name="STDOUT" target="SYSTEM_OUT">
       <PatternLayout>
         <Pattern>
-          %maxLen{%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p (%t) [%X{collection} %X{shard} %X{replica} %X{core}] %c{1.} %m%notEmpty{ =>%ex{short}}}{10240}%n
+          %maxLen{%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p (%t) [%X{node_name} %X{core}] %c{1.} %m%notEmpty{ =>%ex{short}}}{10240}%n
         </Pattern>
       </PatternLayout>
     </Console>
@@ -34,14 +34,14 @@
         filePattern="${sys:solr.log.dir}/solr.log.%i" >
       <PatternLayout>
         <Pattern>
-          %maxLen{%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p (%t) [%X{collection} %X{shard} %X{replica} %X{core}] %c{1.} %m%notEmpty{ =>%ex{short}}}{10240}%n
+          %maxLen{%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p (%t) [%X{node_name} %X{core}] %c{1.} %m%notEmpty{ =>%ex{short}}}{10240}%n
         </Pattern>
       </PatternLayout>
       <Policies>
         <OnStartupTriggeringPolicy />
-        <SizeBasedTriggeringPolicy size="32 MB"/>
+        <SizeBasedTriggeringPolicy size="64 MB"/>
       </Policies>
-      <DefaultRolloverStrategy max="10"/>
+      <DefaultRolloverStrategy max="20"/>
     </RollingRandomAccessFile>
 
     <RollingRandomAccessFile
@@ -50,7 +50,7 @@
         filePattern="${sys:solr.log.dir}/solr_slow_requests.log.%i" >
       <PatternLayout>
         <Pattern>
-          %maxLen{%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p (%t) [%X{collection} %X{shard} %X{replica} %X{core}] %c{1.} %m%notEmpty{ =>%ex{short}}}{10240}%n
+          %maxLen{%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p (%t) [%X{node_name} %X{core}] %c{1.} %m%notEmpty{ =>%ex{short}}}{10240}%n
         </Pattern>
       </PatternLayout>
       <Policies>
@@ -69,6 +69,8 @@
     <AsyncLogger name="org.apache.hadoop" level="WARN"/>
     <AsyncLogger name="org.apache.solr.update.LoggingInfoStream" level="OFF"/>
     <AsyncLogger name="org.apache.zookeeper" level="WARN"/>
+    <AsyncLogger name="org.apache.zookeeper.ClientCnxn" level="ERROR"/>
+    <AsyncLogger name="org.apache.zookeeper.server.ZooKeeperCriticalThread" level="OFF"/>
     <AsyncLogger name="org.apache.solr.core.SolrCore.SlowRequest" level="INFO" additivity="false">
       <AppenderRef ref="SlowLogFile"/>
     </AsyncLogger>
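
    The pattern layouts now render %X{node_name} and %X{core} instead of the old
    collection/shard/replica keys. A sketch of how those MDC keys get populated on the
    logging thread (key names must match the %X{...} references; values are made up):

        MDC.put("node_name", "n=127.0.0.1:8983_solr");      // rendered by %X{node_name}
        MDC.put("core", "c=collection1_shard1_replica_n1"); // rendered by %X{core}
        try {
          log.info("handling request");                     // line carries node and core context
        } finally {
          MDC.remove("node_name");                          // don't leak context to pooled threads
          MDC.remove("core");
        }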
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/Http2SolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/Http2SolrClient.java
index 945fb62..3293bf4 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/Http2SolrClient.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/Http2SolrClient.java
@@ -83,6 +83,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.lang.invoke.MethodHandles;
+import java.lang.management.ManagementFactory;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URL;
@@ -116,6 +117,9 @@ import java.util.concurrent.TimeoutException;
  * @lucene.experimental
  */
 public class Http2SolrClient extends SolrClient {
+
+  public static final int PROC_COUNT = ManagementFactory.getOperatingSystemMXBean().getAvailableProcessors();
+
   public static final String REQ_PRINCIPAL_KEY = "solr-req-principal";
 
   private static volatile SSLConfig defaultSSLConfig;
@@ -218,9 +222,11 @@ public class Http2SolrClient extends SolrClient {
       ssl = true;
     }
     // nocommit - look at config again as well
-    int minThreads = Integer.getInteger("solr.minHttp2ClientThreads", 6);
+    int minThreads = Integer.getInteger("solr.minHttp2ClientThreads", PROC_COUNT);
+
+    minThreads = Math.min( builder.maxThreadPoolSize, minThreads);
     httpClientExecutor = new SolrQueuedThreadPool("http2Client", builder.maxThreadPoolSize, minThreads,
-        this.headers != null && this.headers.containsKey(QoSParams.REQUEST_SOURCE) && this.headers.get(QoSParams.REQUEST_SOURCE).equals(QoSParams.INTERNAL) ? 3000 : 5000,
+        this.headers != null && this.headers.containsKey(QoSParams.REQUEST_SOURCE) && this.headers.get(QoSParams.REQUEST_SOURCE).equals(QoSParams.INTERNAL) ? 1000 : 1000,
         null, -1, null);
     httpClientExecutor.setLowThreadsThreshold(-1);
 
@@ -470,7 +476,7 @@ public class Http2SolrClient extends SolrClient {
 
             } catch (Exception e) {
               if (SolrException.getRootCause(e) != CANCELLED_EXCEPTION) {
-                asyncListener.onFailure(e, 500);
+                asyncListener.onFailure(e, e instanceof  SolrException ? ((SolrException) e).code() : 500);
               }
             } finally {
               arrived = true;
@@ -890,15 +896,15 @@ public class Http2SolrClient extends SolrClient {
               log.warn("", e1);
             }
           }
-          throw new RemoteSolrException(serverBaseUrl, 527, msg, null);
+          throw new RemoteSolrException(serverBaseUrl, -1, msg, null);
         }
       }
 
       NamedList<Object> rsp;
-      int httpStatus = 527;
+      int httpStatus = -1;
 
       try {
-        httpStatus = listener.get(10, TimeUnit.SECONDS).getStatus();
+        httpStatus = response.getStatus();
       } catch (Exception e) {
         log.warn("", e);
       }
@@ -1101,7 +1107,7 @@ public class Http2SolrClient extends SolrClient {
 
   public static class Builder {
 
-    public int maxThreadPoolSize = Integer.getInteger("solr.maxHttp2ClientThreads", 512);
+    public int maxThreadPoolSize = Integer.getInteger("solr.maxHttp2ClientThreads", Math.max(7, PROC_COUNT * 2));
     public int maxRequestsQueuedPerDestination = 1600;
     private Http2SolrClient http2SolrClient;
     private SSLConfig sslConfig = defaultSSLConfig;
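
    Pool sizing for the HTTP/2 client is now derived from the machine instead of a fixed
    512-thread ceiling. The effective defaults, restated as a standalone sketch:

        int procs = ManagementFactory.getOperatingSystemMXBean().getAvailableProcessors();
        int maxThreads = Integer.getInteger("solr.maxHttp2ClientThreads", Math.max(7, procs * 2));
        // min is clamped so a user-supplied floor can never exceed the ceiling
        int minThreads = Math.min(maxThreads, Integer.getInteger("solr.minHttp2ClientThreads", procs));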
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttp2SolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttp2SolrClient.java
index 756f239..0f750b6e 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttp2SolrClient.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttp2SolrClient.java
@@ -81,7 +81,7 @@ public class LBHttp2SolrClient extends LBSolrClient {
 
   public LBHttp2SolrClient(String... baseSolrUrls) {
     super(Arrays.asList(baseSolrUrls));
-
+    // nocommit - should only be internal for us
     this.httpClient = new Http2SolrClient.Builder().markInternalRequest()
         // .withResponseParser(responseParser) // nocommit
         // .allowCompression(compression) // nocommit
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttpSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttpSolrClient.java
index 6b1183b..dba11bd 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttpSolrClient.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttpSolrClient.java
@@ -184,6 +184,7 @@ public class LBHttpSolrClient extends LBSolrClient {
     SolrClient client;
     if (http2SolrClientBuilder != null) {
       synchronized (this) {
+        // nocommit - should only be internal for us
         http2SolrClientBuilder
                 .withBaseUrl(server)
                 .markInternalRequest()
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java
index e286fe0..c24c9b1 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java
@@ -65,7 +65,7 @@ public abstract class LBSolrClient extends SolrClient {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
   // defaults
-  protected static final Set<Integer> RETRY_CODES = new HashSet<>(Arrays.asList(404, 403, 503, 500));
+  protected static final Set<Integer> RETRY_CODES = new HashSet<>(Arrays.asList(403, 503, 500));
   private static final int CHECK_INTERVAL = 60 * 1000; //1 minute between checks
   private static final int NONSTANDARD_PING_LIMIT = 5;  // number of times we'll ping dead servers not in the server list
   public static final ServerWrapper[] EMPTY_SERVER_WRAPPER = new ServerWrapper[0];
@@ -759,20 +759,8 @@ public abstract class LBSolrClient extends SolrClient {
   public void close() {
     this.closed = true;
 
-//    ScheduledThreadPoolExecutor aexec = aliveCheckExecutor;
-//    if (aexec != null) {
-//      aliveCheckExecutor.shutdown();
-//      try {
-//        boolean success = aliveCheckExecutor.awaitTermination(1, TimeUnit.SECONDS);
-//        if (!success) {
-//          aliveCheckExecutor.shutdownNow();
-//        }
-//      } catch (InterruptedException e) {
-//        ParWork.propagateInterrupt(e);
-//      }
-
     if (aliveCheckExecutor != null) {
-      aliveCheckExecutor.shutdown();
+      aliveCheckExecutor.shutdownNow();
     }
     assert ObjectReleaseTracker.release(this);
   }
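
    close() now uses shutdownNow() so the alive-check executor is interrupted rather than
    allowed to finish queued pings. The difference in isolation, sketched with a plain
    JDK scheduler:

        ScheduledExecutorService exec = Executors.newSingleThreadScheduledExecutor();
        exec.shutdown();     // no new tasks, but queued alive checks still run
        exec.shutdownNow();  // interrupts the worker and drains the queue immediately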
diff --git a/solr/solrj/src/java/org/apache/solr/common/ParWork.java b/solr/solrj/src/java/org/apache/solr/common/ParWork.java
index 0627e65..718fed8 100644
--- a/solr/solrj/src/java/org/apache/solr/common/ParWork.java
+++ b/solr/solrj/src/java/org/apache/solr/common/ParWork.java
@@ -73,15 +73,17 @@ public class ParWork implements Closeable {
 
   private static volatile ParWorkExecutor EXEC;
 
-  // pretty much don't use it
   public static ParWorkExecutor getRootSharedExecutor() {
     if (EXEC == null) {
       synchronized (ParWork.class) {
         if (EXEC == null) {
-          EXEC = (ParWorkExecutor) getParExecutorService("RootExec",
-              Integer.getInteger("solr.rootSharedThreadPoolCoreSize", 15), Integer.MAX_VALUE, 1000,
+          EXEC = (ParWorkExecutor) getParExecutorService("Root",
+              Integer.getInteger("solr.rootSharedThreadPoolCoreSize", 32), Integer.MAX_VALUE, 1000,
               new SynchronousQueue());
           ((ParWorkExecutor)EXEC).enableCloseLock();
+          for (int i = 0; i < 16; i++) {
+            EXEC.submit(() -> {});
+          }
         }
       }
     }
@@ -89,10 +91,12 @@ public class ParWork implements Closeable {
   }
 
   public static void shutdownParWorkExecutor() {
-    try {
-      shutdownParWorkExecutor(EXEC, true);
-    } finally {
-      EXEC = null;
+    synchronized (ParWork.class) {
+      try {
+        shutdownParWorkExecutor(EXEC, true);
+      } finally {
+        EXEC = null;
+      }
     }
   }
 
@@ -496,7 +500,7 @@ public class ParWork implements Closeable {
         Integer minThreads;
         Integer maxThreads;
         minThreads = 4;
-        maxThreads = PROC_COUNT;
+        maxThreads = PROC_COUNT / 2;
         exec = getExecutorService(Math.max(minThreads, maxThreads)); // keep alive directly affects how long a worker might
        // ((PerThreadExecService)exec).closeLock(true);
         // be stuck in poll without an enqueue on shutdown
@@ -524,11 +528,10 @@ public class ParWork implements Closeable {
   }
 
   private void handleObject(AtomicReference<Throwable> exception, final TimeTracker workUnitTracker, ParObject ob) {
-    if (log.isDebugEnabled()) {
-      log.debug(
+    if (log.isTraceEnabled()) log.trace(
           "handleObject(AtomicReference<Throwable> exception={}, CloseTimeTracker workUnitTracker={}, Object object={}) - start",
           exception, workUnitTracker, ob.object);
-    }
+
     Object object = ob.object;
 
     Object returnObject = null;
@@ -592,9 +595,7 @@ public class ParWork implements Closeable {
       assert subTracker.doneClose(returnObject instanceof String ? (String) returnObject : (returnObject == null ? "" : returnObject.getClass().getName()));
     }
 
-    if (log.isDebugEnabled()) {
-      log.debug("handleObject(AtomicReference<Throwable>, CloseTimeTracker, List<Callable<Object>>, Object) - end");
-    }
+    if (log.isTraceEnabled()) log.trace("handleObject(AtomicReference<Throwable>, CloseTimeTracker, List<Callable<Object>>, Object) - end");
   }
 
   /**
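
    getRootSharedExecutor() keeps the double-checked-locking lazy init (safe because the
    field is volatile) and now pre-starts workers by submitting no-op tasks. A
    self-contained sketch of the pattern, using a plain JDK pool in place of
    ParWorkExecutor:

        private static volatile ExecutorService EXEC;

        static ExecutorService shared() {
          if (EXEC == null) {                 // fast path: no lock once created
            synchronized (ParWork.class) {
              if (EXEC == null) {             // re-check under the lock
                ExecutorService e = Executors.newCachedThreadPool();
                for (int i = 0; i < 16; i++) {
                  e.submit(() -> {});         // warm the pool so first real tasks skip thread creation
                }
                EXEC = e;
              }
            }
          }
          return EXEC;
        }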
diff --git a/solr/solrj/src/java/org/apache/solr/common/ParWorkExecutor.java b/solr/solrj/src/java/org/apache/solr/common/ParWorkExecutor.java
index dc1890f..48581bc 100644
--- a/solr/solrj/src/java/org/apache/solr/common/ParWorkExecutor.java
+++ b/solr/solrj/src/java/org/apache/solr/common/ParWorkExecutor.java
@@ -17,7 +17,6 @@
 package org.apache.solr.common;
 
 import org.apache.solr.common.util.CloseTracker;
-import org.apache.solr.common.util.ExecutorUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -26,7 +25,6 @@ import java.util.List;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingDeque;
 import java.util.concurrent.RejectedExecutionException;
-import java.util.concurrent.SynchronousQueue;
 import java.util.concurrent.ThreadFactory;
 import java.util.concurrent.ThreadPoolExecutor;
 import java.util.concurrent.TimeUnit;
@@ -51,7 +49,7 @@ public class ParWorkExecutor extends ThreadPoolExecutor {
 
   public ParWorkExecutor(String name, int corePoolsSize, int maxPoolsSize,
       int keepalive, BlockingQueue<Runnable> workQueue) {
-    super(corePoolsSize, maxPoolsSize, keepalive, TimeUnit.MILLISECONDS, workQueue
+    super(corePoolsSize, Math.max(corePoolsSize, maxPoolsSize), keepalive, TimeUnit.MILLISECONDS, workQueue
     , new ParWorkThreadFactory(name));
     assert (closeTracker = new CloseTracker(false)) != null;
   }
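
    The Math.max guard matters because ThreadPoolExecutor rejects
    maximumPoolSize < corePoolSize with an IllegalArgumentException. Sketch:

        int core = 32, max = 16;              // a combination callers could pass
        new ThreadPoolExecutor(core, Math.max(core, max), 1000, TimeUnit.MILLISECONDS,
            new LinkedBlockingDeque<>());     // constructs fine; effective max is 32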
diff --git a/solr/solrj/src/java/org/apache/solr/common/PerThreadExecService.java b/solr/solrj/src/java/org/apache/solr/common/PerThreadExecService.java
index f8ccd29..e1cbec1 100644
--- a/solr/solrj/src/java/org/apache/solr/common/PerThreadExecService.java
+++ b/solr/solrj/src/java/org/apache/solr/common/PerThreadExecService.java
@@ -36,8 +36,6 @@ public class PerThreadExecService extends AbstractExecutorService {
 
   private final AtomicInteger running = new AtomicInteger();
 
-  private final Object awaitTerminate = new Object();
-
   private CloseTracker closeTracker;
 
   private SysStats sysStats = ParWork.getSysStats();
@@ -112,14 +110,12 @@ public class PerThreadExecService extends AbstractExecutorService {
     TimeOut timeout = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME);
     while (running.get() > 0) {
       if (timeout.hasTimedOut()) {
-        log.warn("return before reaching termination, wait for {} {}, running={}", l, timeout, running);
+        log.error("return before reaching termination, wait for {} {}, running={}", l, timeout, running);
         return false;
       }
 
       // System.out.println("WAIT : " + workQueue.size() + " " + available.getQueueLength() + " " + workQueue.toString());
-      synchronized (awaitTerminate) {
-        awaitTerminate.wait(500);
-      }
+      Thread.sleep(250);
     }
 
     if (isShutdown()) {
@@ -142,33 +138,35 @@ public class PerThreadExecService extends AbstractExecutorService {
         try {
           available.acquire();
         } catch (InterruptedException e) {
+          running.decrementAndGet();
           throw new RejectedExecutionException("Interrupted");
         }
       }
       try {
         service.submit(() -> {
           runIt(runnable, noCallerRunsAvailableLimit, false);
-          if (noCallerRunsAvailableLimit) {
-            available.release();
-          }
         });
       } catch (Exception e) {
         log.error("", e);
-        running.decrementAndGet();
-        synchronized (awaitTerminate) {
-          awaitTerminate.notifyAll();
+        if (noCallerRunsAvailableLimit) {
+          available.release();
         }
+        running.decrementAndGet();
+        throw e;
       }
       return;
     }
 
-    if (!checkLoad()) {
-      runIt(runnable, false, false);
-      return;
+    try {
+      available.acquire();
+    } catch (InterruptedException e) {
+      running.decrementAndGet();
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
     }
 
-    if (!available.tryAcquire()) {
-      runIt(runnable, false, false);
+
+    if (!noCallerRunsAllowed && checkLoad()) {
+      runIt(runnable, true, false);
       return;
     }
 
@@ -176,10 +174,13 @@ public class PerThreadExecService extends AbstractExecutorService {
     try {
       service.submit(() -> runIt(finalRunnable, true, false));
     } catch (Exception e) {
-      running.decrementAndGet();
-      synchronized (awaitTerminate) {
-        awaitTerminate.notifyAll();
+      log.error("Exception submitting", e);
+      try {
+        available.release();
+      } finally {
+        running.decrementAndGet();
       }
+      throw e;
     }
   }
 
@@ -192,12 +193,7 @@ public class PerThreadExecService extends AbstractExecutorService {
           available.release();
         }
       } finally {
-        if (!alreadyShutdown) {
-          running.decrementAndGet();
-          synchronized (awaitTerminate) {
-            awaitTerminate.notifyAll();
-          }
-        }
+        running.decrementAndGet();
       }
     }
   }
@@ -206,22 +202,20 @@ public class PerThreadExecService extends AbstractExecutorService {
     return maxSize;
   }
 
-  public boolean checkLoad() {
+  private boolean checkLoad() {
 
-    double ourLoad = ParWork.getSysStats().getTotalUsage();
-    if (ourLoad > SysStats.OUR_LOAD_HIGH) {
-      if (log.isDebugEnabled()) log.debug("Our cpu usage is too high, run in caller thread {}", ourLoad);
-      return false;
-    } else {
-      double sLoad = sysStats.getSystemLoad();
-      if (sLoad > 1) {
-        if (log.isDebugEnabled()) log.debug("System load is too high, run in caller thread {}", sLoad);
-        return false;
-      }
+    double sLoad = sysStats.getSystemLoad();
+
+    if (hiStateLoad(sLoad)) {
+      return true;
     }
-    return true;
+    return false;
   }
-  
+
+  private boolean hiStateLoad(double sLoad) {
+    return sLoad > 0.8d && running.get() > 3;
+  }
+
   public void closeLock(boolean lock) {
     if (lock) {
       closeTracker.enableCloseLock();
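
    The execute path now acquires a permit unconditionally and releases it (and
    decrements running) on every failure branch, so awaitTermination's counter can
    always drain. A simplified, self-contained sketch of that bookkeeping (the pool
    bound of 8 is an arbitrary stand-in):

        class BoundedExec {
          private final ExecutorService service = Executors.newCachedThreadPool();
          private final Semaphore available = new Semaphore(8);  // caps in-flight tasks
          private final AtomicInteger running = new AtomicInteger();

          void execute(Runnable r) throws InterruptedException {
            running.incrementAndGet();
            try {
              available.acquire();                              // throttle before submit
            } catch (InterruptedException e) {
              running.decrementAndGet();                        // undo on every failure path
              throw e;
            }
            try {
              service.submit(() -> {
                try { r.run(); }
                finally { available.release(); running.decrementAndGet(); }
              });
            } catch (RejectedExecutionException e) {
              available.release();
              running.decrementAndGet();
              throw e;
            }
          }
        }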
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java b/solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java
index 86c6538..47d7608 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java
@@ -59,6 +59,8 @@ public class Replica extends ZkNodeProps {
      * full replication or finding out things are already in sync.
      */
     RECOVERING,
+
+    BUFFERING,
     
     /**
      * Recovery attempts have not worked, something is not right.
@@ -87,6 +89,8 @@ public class Replica extends ZkNodeProps {
         return State.ACTIVE;
       } else if (shortState.equals("r")) {
         return State.RECOVERING;
+      } else if (shortState.equals("b")) {
+        return State.BUFFERING;
       } else if (shortState.equals("d")) {
         return State.DOWN;
       } else if (shortState.equals("f")) {
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZooKeeper.java b/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZooKeeper.java
index 0546b05..f3fc018 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZooKeeper.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZooKeeper.java
@@ -119,7 +119,11 @@ public class SolrZooKeeper extends ZooKeeper {
     } catch (Exception e) {
       log.warn("Exception closing zookeeper client", e);
     }
-
+//    try {
+//      super.close();
+//    } catch (InterruptedException e) {
+//      e.printStackTrace();
+//    }
   }
 
 }
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
index 48c3329..d447d78 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
@@ -62,6 +62,7 @@ import org.apache.solr.common.util.IOUtils;
 import org.apache.solr.common.util.ObjectReleaseTracker;
 import org.apache.solr.common.util.Pair;
 import org.apache.solr.common.util.Utils;
+import org.apache.solr.logging.MDCLoggingContext;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.KeeperException.NoNodeException;
 import org.apache.zookeeper.WatchedEvent;
@@ -124,7 +125,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
   public static final String COLLECTION_PROPS_ZKNODE = "collectionprops.json";
   public static final String REJOIN_AT_HEAD_PROP = "rejoinAtHead";
   public static final String SOLR_SECURITY_CONF_PATH = "/security.json";
-  public static final String SOLR_AUTOSCALING_CONF_PATH = "/autoscaling.json";
   public static final String SOLR_PKGS_PATH = "/packages.json";
 
   public static final String DEFAULT_SHARD_PREFERENCES = "defaultShardPreferences";
@@ -211,8 +211,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
 
   private final ConcurrentHashMap<String, CollectionWatch<DocCollectionWatcher>> collectionWatches = new ConcurrentHashMap<>(32, 0.75f, 3);
 
-  private final ConcurrentHashMap<String, ReentrantLock> collectionLocks = new ConcurrentHashMap<>(32, 0.75f, 3);
-
   private final Map<String,StateWatcher> stateWatchersMap = new ConcurrentHashMap<>(32, 0.75f, 3);
 
   // named this observers so there's less confusion between CollectionPropsWatcher map and the PropsWatcher map.
@@ -229,6 +227,7 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
   private static final long LAZY_CACHE_TIME = TimeUnit.NANOSECONDS.convert(15000, TimeUnit.MILLISECONDS); // nocommit
 
   private volatile Future<?> collectionPropsCacheCleaner; // only kept to identify if the cleaner has already been started.
+  private volatile String node = null;
 
   public static interface CollectionRemoved {
     void removed(String collection);
@@ -369,15 +368,9 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
       Collection<String> safeCopy = new ArrayList<>(watchedCollectionStates.keySet());
       Set<String> updatedCollections = new HashSet<>();
       for (String coll : safeCopy) {
-        ReentrantLock lock = collectionLocks.get(coll);
-        if (lock != null) lock.lock();
-        try {
-          DocCollection newState = fetchCollectionState(coll, null);
-          if (updateWatchedCollection(coll, newState)) {
-            updatedCollections.add(coll);
-          }
-        } finally {
-          if (lock != null) lock.unlock();
+        DocCollection newState = fetchCollectionState(coll, null);
+        if (updateWatchedCollection(coll, newState)) {
+          updatedCollections.add(coll);
         }
       }
       constructState(updatedCollections);
@@ -393,8 +386,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
 
 
   public void forciblyRefreshClusterStateSlow(String name) {
-    ReentrantLock lock = collectionLocks.get(name);
-    if (lock != null) lock.lock();
     try {
       refreshCollectionList(null);
       refreshLiveNodes(null);
@@ -416,8 +407,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
     } catch (InterruptedException e) {
       ParWork.propagateInterrupt(e);
       throw new SolrException(ErrorCode.SERVER_ERROR, e);
-    } finally {
-      if (lock != null) lock.unlock();
     }
   }
 
@@ -429,35 +418,28 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
   }
 
   public Integer compareStateVersions(String coll, int version) {
-    DocCollection collection = null;
-    ReentrantLock lock = collectionLocks.get(coll);
-    if (lock != null) lock.lock();
-    try {
-      collection = clusterState.getCollectionOrNull(coll);
-      if (collection == null) return null;
-      if (collection.getZNodeVersion() < version) {
-        if (log.isDebugEnabled()) {
-          log.debug("Server older than client {}<{}", collection.getZNodeVersion(), version);
-        }
-        DocCollection nu = getCollectionLive(this, coll);
-        if (nu == null) return -3;
-        if (nu.getZNodeVersion() > collection.getZNodeVersion()) {
-          if (updateWatchedCollection(coll, nu)) {
-            constructState(Collections.singleton(coll));
-          }
-          collection = nu;
+    DocCollection collection = clusterState.getCollectionOrNull(coll);
+    if (collection == null) return null;
+    if (collection.getZNodeVersion() < version) {
+      if (log.isDebugEnabled()) {
+        log.debug("Server older than client {}<{}", collection.getZNodeVersion(), version);
+      }
+      DocCollection nu = getCollectionLive(this, coll);
+      if (nu == null) return -3;
+      if (nu.getZNodeVersion() > collection.getZNodeVersion()) {
+        if (updateWatchedCollection(coll, nu)) {
+          constructState(Collections.singleton(coll));
         }
+        collection = nu;
       }
+    }
 
-      if (collection.getZNodeVersion() == version) {
-        return null;
-      }
+    if (collection.getZNodeVersion() == version) {
+      return null;
+    }
 
-      if (log.isDebugEnabled()) {
-        log.debug("Wrong version from client [{}]!=[{}]", version, collection.getZNodeVersion());
-      }
-    } finally {
-      if (lock != null) lock.unlock();
+    if (log.isDebugEnabled()) {
+      log.debug("Wrong version from client [{}]!=[{}]", version, collection.getZNodeVersion());
     }
 
     return collection.getZNodeVersion();
@@ -618,14 +600,14 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
       Map<String,ClusterState.CollectionRef> result = new LinkedHashMap<>();
 
       // Add collections
-      for (Map.Entry<String,DocCollection> entry : watchedCollectionStates.entrySet()) {
-        result.put(entry.getKey(), new ClusterState.CollectionRef(entry.getValue()));
-      }
+      watchedCollectionStates.forEach((s, slices) -> {
+        result.put(s, new ClusterState.CollectionRef(slices));
+      });
 
       // Finally, add any lazy collections that aren't already accounted for.
-      for (Map.Entry<String,LazyCollectionRef> entry : lazyCollectionStates.entrySet()) {
-        result.putIfAbsent(entry.getKey(), entry.getValue());
-      }
+      lazyCollectionStates.forEach((s, lazyCollectionRef) -> {
+        result.putIfAbsent(s, lazyCollectionRef);
+      });
 
       this.clusterState = new ClusterState(result, -1);
 
@@ -789,6 +771,9 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
           lastUpdateTime = System.nanoTime();
         }
       }
+      if (cachedDocCollection == null) {
+        log.error("cached collection is null");
+      }
       return cachedDocCollection;
     }
 
@@ -966,7 +951,10 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
 
   public boolean isNodeLive(String node) {
     return getLiveNodes().contains(node);
+  }
 
+  public void setNode(String node) {
+    this.node = node;
   }
 
   /**
@@ -987,33 +975,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
    * Get shard leader properties, with retry if none exist.
    */
   public Replica getLeaderRetry(String collection, String shard, int timeout, boolean mustBeLive) throws InterruptedException, TimeoutException {
-
-    DocCollection coll = getClusterState().getCollectionOrNull(collection);
-
-    if (coll != null) {
-      Slice slice = coll.getSlice(shard);
-      if (slice != null) {
-        Replica leader = slice.getLeader();
-        boolean valid;
-        try {
-          valid = mustBeLive ? isNodeLive(leader.getNodeName()) || zkClient.exists(COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" + slice.getName() + "/leader") : isNodeLive(leader.getNodeName());
-        } catch (KeeperException e) {
-          throw new SolrException(ErrorCode.SERVER_ERROR, e);
-        } catch (InterruptedException e) {
-          throw new SolrException(ErrorCode.SERVER_ERROR, e);
-        }
-        if (leader != null && leader.getState() == Replica.State.ACTIVE && valid) {
-          return leader;
-        }
-        Collection<Replica> replicas = slice.getReplicas();
-        for (Replica replica : replicas) {
-          if ("true".equals(replica.getProperty(LEADER_PROP)) && replica.getState() == Replica.State.ACTIVE && valid) {
-            return replica;
-          }
-        }
-      }
-    }
-
     AtomicReference<Replica> returnLeader = new AtomicReference<>();
     try {
       waitForState(collection, timeout, TimeUnit.MILLISECONDS, (n, c) -> {
@@ -1021,37 +982,40 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
           return false;
         Slice slice = c.getSlice(shard);
         if (slice == null) return false;
+        Replica zkLeader = null;
         Replica leader = slice.getLeader();
-
         if (leader != null && leader.getState() == Replica.State.ACTIVE) {
-          boolean valid = false;
-          try {
-            valid = mustBeLive ?  isNodeLive(leader.getNodeName()) || zkClient.exists(COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" + slice.getName()  + "/leader") : isNodeLive(leader.getNodeName());
-          } catch (KeeperException e) {
-            throw new SolrException(ErrorCode.SERVER_ERROR, e);
-          } catch (InterruptedException e) {
-            throw new SolrException(ErrorCode.SERVER_ERROR, e);
-          }
-          if (valid) {
+          if (isNodeLive(leader.getNodeName())) {
             returnLeader.set(leader);
             return true;
           }
+
+          if (!mustBeLive) {
+            if (zkLeader == null) {
+              zkLeader = getLeaderProps(collection, shard);
+            }
+            if (zkLeader != null && zkLeader.getName().equals(leader.getName())) {
+              returnLeader.set(leader);
+              return true;
+            }
+          }
         }
         Collection<Replica> replicas = slice.getReplicas();
         for (Replica replica : replicas) {
           if ("true".equals(replica.getProperty(LEADER_PROP)) && replica.getState() == Replica.State.ACTIVE) {
-            boolean valid = false;
-            try {
-              valid = mustBeLive ?  zkClient.exists(COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" + slice.getName()  + "/leader") : isNodeLive(leader.getNodeName());
-            } catch (KeeperException e) {
-              throw new SolrException(ErrorCode.SERVER_ERROR, e);
-            } catch (InterruptedException e) {
-              throw new SolrException(ErrorCode.SERVER_ERROR, e);
-            }
-            if (valid) {
+            if (isNodeLive(replica.getNodeName())) {
               returnLeader.set(replica);
               return true;
             }
+            if (!mustBeLive) {
+              if (zkLeader == null) {
+                zkLeader = getLeaderProps(collection, shard);
+              }
+              if (zkLeader != null && zkLeader.getName().equals(replica.getName())) {
+                returnLeader.set(replica);
+                return true;
+              }
+            }
           }
         }
 
@@ -1060,11 +1024,30 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
     } catch (TimeoutException e) {
       throw new TimeoutException("No registered leader was found after waiting for "
           + timeout + "ms " + ", collection: " + collection + " slice: " + shard + " saw state=" + clusterState.getCollectionOrNull(collection)
-          + " with live_nodes=" + liveNodes);
+          + " with live_nodes=" + liveNodes + " zkLeaderNode=" + getLeaderProps(collection, shard));
     }
     return returnLeader.get();
   }
 
+  public Replica getLeaderProps(final String collection, final String slice) {
+
+    try {
+      byte[] data = zkClient.getData(ZkStateReader.getShardLeadersPath(collection, slice), null, null);
+      ZkCoreNodeProps leaderProps = new ZkCoreNodeProps(ZkNodeProps.load(data));
+      String name = leaderProps.getNodeProps().getStr(ZkStateReader.CORE_NAME_PROP);
+      leaderProps.getNodeProps().getProperties().remove(ZkStateReader.CORE_NAME_PROP);
+      // nocommit - right key for leader name?
+      return new Replica(name, leaderProps.getNodeProps().getProperties(), collection, slice, this);
+
+    } catch (KeeperException.NoNodeException e) {
+      return null;
+    } catch (Exception e) {
+      SolrZkClient.checkInterrupted(e);
+      throw new SolrException(ErrorCode.SERVER_ERROR, e);
+    }
+
+  }
+
   /**
    * Get path where shard leader properties live in zookeeper.
    */
@@ -1094,13 +1077,13 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
   }
 
   public List<Replica> getReplicaProps(String collection, String shardId, String thisCoreNodeName,
-                                               Replica.State mustMatchStateFilter, Replica.State mustNotMatchStateFilter) {
+                                               Replica.State mustMatchStateFilter, Replica.State mustMatchStateFilter2) {
     //TODO: We don't need all these getReplicaProps method overloading. Also, it's odd that the default is to return replicas of type TLOG and NRT only
-    return getReplicaProps(collection, shardId, thisCoreNodeName, mustMatchStateFilter, null, EnumSet.of(Replica.Type.TLOG, Replica.Type.NRT));
+    return getReplicaProps(collection, shardId, thisCoreNodeName, mustMatchStateFilter, mustMatchStateFilter2, EnumSet.of(Replica.Type.TLOG, Replica.Type.NRT));
   }
 
   public List<Replica> getReplicaProps(String collection, String shardId, String thisCoreNodeName,
-                                               Replica.State mustMatchStateFilter, Replica.State mustNotMatchStateFilter, final EnumSet<Replica.Type> acceptReplicaType) {
+                                               Replica.State mustMatchStateFilter, Replica.State mustMatchStateFilter2, final EnumSet<Replica.Type> acceptReplicaType) {
     assert thisCoreNodeName != null;
     ClusterState clusterState = this.clusterState;
     if (clusterState == null) {
@@ -1125,10 +1108,8 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
       String coreNodeName = entry.getValue().getName();
 
       if (liveNodes.contains(nodeProps.getNodeName()) && !coreNodeName.equals(thisCoreNodeName)) {
-        if (mustMatchStateFilter == null || mustMatchStateFilter == nodeProps.getState()) {
-          if (mustNotMatchStateFilter == null || mustNotMatchStateFilter != nodeProps.getState()) {
-            nodes.add(nodeProps);
-          }
+        if (mustMatchStateFilter == null || (mustMatchStateFilter == nodeProps.getState() || mustMatchStateFilter2 == nodeProps.getState())) {
+          nodes.add(nodeProps);
         }
       }
     }
@@ -1399,26 +1380,23 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
 
       if (closed) return;
 
-      ReentrantLock lock = collectionLocks.get(coll);
-      if (lock != null) lock.lock();
-      try {
-//        if (!collectionWatches.containsKey(coll)) {
-//          // This collection is no longer interesting, stop watching.
-//          log.debug("Uninteresting collection {}", coll);
-//          return;
-//        }
-
-        Set<String> liveNodes = ZkStateReader.this.liveNodes;
-        if (log.isInfoEnabled()) {
-          log.info("A cluster state change: [{}] for collection [{}] has occurred - updating... (live nodes size: [{}])", event, coll, liveNodes.size());
-        }
+      if (node != null) {
+        MDCLoggingContext.setNode(node);
+      }
 
-        refreshAndWatch();
+      if (!collectionWatches.containsKey(coll)) {
+        // This collection is no longer interesting, stop watching.
+        log.debug("Uninteresting collection {}", coll);
+        return;
+      }
 
-      } finally {
-        if (lock != null) lock.unlock();
+      Set<String> liveNodes = ZkStateReader.this.liveNodes;
+      if (log.isInfoEnabled()) {
+        log.info("A cluster state change: [{}] for collection [{}] has occurred - updating... (live nodes size: [{}])", event, coll, liveNodes.size());
       }
 
+      refreshAndWatch();
+
     }
 
     /**
@@ -1456,8 +1434,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
           }
           if (log.isDebugEnabled()) log.debug("_statupdates event {}", event);
 
-          ReentrantLock lock = collectionLocks.get(coll);
-          if (lock != null) lock.lock();
           try {
 
             //            if (event.getType() == EventType.NodeDataChanged ||
@@ -1467,8 +1443,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
 
           } catch (Exception e) {
             log.error("Unwatched collection: [{}]", coll, e);
-          } finally {
-            if (lock != null) lock.unlock();
           }
         }
 
@@ -1587,17 +1561,18 @@
             Set<String> liveNodes = ZkStateReader.this.liveNodes; // volatile read
             // Add collections
-            for (Map.Entry<String,DocCollection> entry : watchedCollectionStates.entrySet()) {
-              if (!entry.getKey().equals(coll)) {
-                result.put(entry.getKey(), new ClusterState.CollectionRef(entry.getValue()));
-              }
-            }
+            watchedCollectionStates.forEach((s, slices) -> {
+              if (!s.equals(coll)) {
+                result.put(s, new ClusterState.CollectionRef(slices));
+              }
+            });
 
             // Finally, add any lazy collections that aren't already accounted for.
-            for (Map.Entry<String,LazyCollectionRef> entry : lazyCollectionStates.entrySet()) {
-              if (!entry.getKey().equals(coll)) {
-                result.putIfAbsent(entry.getKey(), entry.getValue());
+            lazyCollectionStates.forEach((s, lazyCollectionRef) -> {
+              if (!s.equals(coll)) {
+                result.putIfAbsent(s, lazyCollectionRef);
               }
-            }
+
+            });
 
             ClusterState cs = new ClusterState(result, -2);
             if (log.isDebugEnabled()) log.debug("Set a new clusterstate based on update diff {}", cs);
@@ -1635,6 +1613,8 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
           work.collect("", () -> {
             try {
               zk.removeWatches(getCollectionSCNPath(coll), this, WatcherType.Any, true);
+            } catch (KeeperException.NoWatcherException e) {
+
             } catch (Exception e) {
               log.info("could not remove watch {} {}", e.getClass().getSimpleName(), e.getMessage());
             }
@@ -1643,6 +1623,8 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
             work.collect("", () -> {
               try {
                 zk.removeWatches(getCollectionStateUpdatesPath(coll), watcher, WatcherType.Any, true);
+              } catch (KeeperException.NoWatcherException e) {
+
               } catch (Exception e) {
                 log.info("could not remove watch {} {}", e.getClass().getSimpleName(), e.getMessage());
               }
@@ -1681,7 +1663,9 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
       if (EventType.None.equals(event.getType())) {
         return;
       }
-
+      if (node != null) {
+        MDCLoggingContext.setNode(node);
+      }
       boolean expired = System.nanoTime() > watchUntilNs;
       if (!collectionPropsObservers.containsKey(coll) && expired) {
         // No one can be notified of the change, we can ignore it and "unset" the watch
@@ -1749,7 +1733,9 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
       if (ZkStateReader.this.closed) {
         return;
       }
-
+      if (node != null) {
+        MDCLoggingContext.setNode(node);
+      }
       // session events are not change events, and do not remove the watcher
       if (EventType.None.equals(event.getType())) {
         return;
@@ -1790,6 +1776,9 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
       if (EventType.None.equals(event.getType())) {
         return;
       }
+      if (node != null) {
+        MDCLoggingContext.setNode(node);
+      }
       if (event.getType() == EventType.NodeDataChanged) {
         if (log.isDebugEnabled()) {
           log.debug("A live node change: [{}], has occurred - updating... (previous live nodes size: [{}])", event, liveNodes.size());
@@ -1913,8 +1902,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
     if (reconstructState.get()) {
       StateWatcher sw = new StateWatcher(collection);
       stateWatchersMap.put(collection, sw);
-      ReentrantLock lock = new ReentrantLock(true);
-      collectionLocks.put(collection, lock);
       sw.refreshAndWatch();
       sw.watchStateUpdates();
     }
@@ -1940,29 +1927,25 @@
       throw new IllegalArgumentException("Collection cannot be null");
     }
     AtomicBoolean reconstructState = new AtomicBoolean(false);
-    ReentrantLock lock = collectionLocks.get(collection);
-    if (lock != null) lock.lock();
-    try {
-      collectionWatches.compute(collection, (k, v) -> {
-        if (v == null) return null;
-        v.coreRefCount.decrementAndGet();
-        if (v.canBeRemoved()) {
-          watchedCollectionStates.remove(collection);
-          collectionLocks.remove(collection);
-          IOUtils.closeQuietly(stateWatchersMap.remove(collection));
-          lazyCollectionStates.put(collection, new LazyCollectionRef(collection));
-          reconstructState.set(true);
-          return null;
-        }
-        return v;
-      });
 
-      if (reconstructState.get()) {
-        constructState(Collections.emptySet());
+    collectionWatches.compute(collection, (k, v) -> {
+      if (v == null) return null;
+      if (v.coreRefCount.get() > 0)
+        v.coreRefCount.decrementAndGet();
+      if (v.canBeRemoved()) {
+        watchedCollectionStates.remove(collection);
+        IOUtils.closeQuietly(stateWatchersMap.remove(collection));
+        lazyCollectionStates.put(collection, new LazyCollectionRef(collection));
+        reconstructState.set(true);
+        return null;
       }
-    } finally {
-      if (lock != null) lock.unlock();
+      return v;
+    });
+
+    if (reconstructState.get()) {
+      constructState(Collections.emptySet());
     }
+
   }
 
   /**
@@ -1991,9 +1973,10 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
     registerDocCollectionWatcher(collection, wrapper);
     registerLiveNodesListener(wrapper);
 
-//    DocCollection state = clusterState.getCollectionOrNull(collection);
-//
-//    removeCollectionStateWatcher(collection, stateWatcher);
+    DocCollection state = clusterState.getCollectionOrNull(collection);
+    if (stateWatcher.onStateChanged(liveNodes, state) == true) {
+      removeCollectionStateWatcher(collection, stateWatcher);
+    }
   }
 
   /**
@@ -2025,8 +2008,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
     if (watchSet.get()) {
       StateWatcher sw = new StateWatcher(collection);
       stateWatchersMap.put(collection, sw);
-      ReentrantLock lock = new ReentrantLock(true);
-      collectionLocks.put(collection, lock);
       sw.refreshAndWatch();
       sw.watchStateUpdates();
     }
@@ -2084,6 +2065,11 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
 
       // wait for the watcher predicate to return true, or time out
       if (!latch.await(wait, unit)) {
+        coll = clusterState.getCollectionOrNull(collection);
+        if (predicate.matches(liveNodes, coll)) {
+          return;
+        }
+
         throw new TimeoutException("Timeout waiting to see state for collection=" + collection + " :" + "live=" + liveNodes
                 + docCollection.get());
       }
@@ -2093,10 +2079,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
     }
   }
 
-  public void waitForCollection(String collection, long wait, TimeUnit unit) throws TimeoutException, InterruptedException {
-    waitForState(collection, wait, unit, (l, c) -> c != null);
-  }
-
   public void waitForActiveCollection(String collection, long wait, TimeUnit unit, int shards, int totalReplicas) {
     waitForActiveCollection(collection, wait, unit, shards, totalReplicas, false);
   }
@@ -2160,6 +2142,10 @@
     try {
       // wait for the watcher predicate to return true, or time out
-      if (!latch.await(wait, unit))
+      if (!latch.await(wait, unit)) {
+        if (predicate.matches(liveNodes)) {
+          return;
+        }
         throw new TimeoutException("Timeout waiting for live nodes, currently they are: " + liveNodes);
+      }
 
     } finally {
@@ -2208,30 +2193,23 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
 
     AtomicBoolean reconstructState = new AtomicBoolean(false);
 
-    ReentrantLock lock = collectionLocks.get(collection);
-    if (lock != null) lock.lock();
-    try {
-      collectionWatches.compute(collection, (k, v) -> {
-        if (v == null) return null;
-        v.stateWatchers.remove(watcher);
-        if (v.canBeRemoved()) {
-          log.info("no longer watch collection {}", collection);
-          watchedCollectionStates.remove(collection);
-          lazyCollectionStates.put(collection, new LazyCollectionRef(collection));
-          collectionLocks.remove(collection);
-          StateWatcher stateWatcher = stateWatchersMap.remove(collection);
-          if (stateWatcher != null) {
-            IOUtils.closeQuietly(stateWatcher);
-          }
-          reconstructState.set(true);
-          return null;
+    collectionWatches.compute(collection, (k, v) -> {
+      if (v == null) return null;
+      v.stateWatchers.remove(watcher);
+      if (v.canBeRemoved()) {
+        log.info("no longer watch collection {}", collection);
+        watchedCollectionStates.remove(collection);
+        lazyCollectionStates.put(collection, new LazyCollectionRef(collection));
+        StateWatcher stateWatcher = stateWatchersMap.remove(collection);
+        if (stateWatcher != null) {
+          IOUtils.closeQuietly(stateWatcher);
         }
-        return v;
-      });
+        reconstructState.set(true);
+        return null;
+      }
+      return v;
+    });
 
-    } finally {
-      if (lock != null) lock.unlock();
-    }
   }
 
   /* package-private for testing */
@@ -2257,7 +2235,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
       if (newState == null) {
         if (log.isDebugEnabled()) log.debug("Removing cached collection state for [{}]", coll);
         watchedCollectionStates.remove(coll);
-        collectionLocks.remove(coll);
         IOUtils.closeQuietly(stateWatchersMap.remove(coll));
         lazyCollectionStates.remove(coll);
         if (collectionRemoved != null) {
@@ -2369,6 +2346,9 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
 
     @Override
     public void run() {
+      if (node != null) {
+        MDCLoggingContext.setNode(node);
+      }
       List<DocCollectionWatcher> watchers = new ArrayList<>();
       synchronized (collectionWatches) {
         collectionWatches.compute(collection, (k, v) -> {
@@ -2517,6 +2497,9 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
       if (EventType.None.equals(event.getType())) {
         return;
       }
+      if (node != null) {
+        MDCLoggingContext.setNode(node);
+      }
       try {
         log.debug("Aliases: updating");
 
@@ -2596,6 +2579,9 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
 
     @Override
     public void run() {
+      if (node != null) {
+        MDCLoggingContext.setNode(node);
+      }
       for (CollectionPropsWatcher watcher : watchers) {
         if (watcher.onStateChanged(collectionProperties)) {
           removeCollectionPropsWatcher(collection, watcher);
diff --git a/solr/solrj/src/java/org/apache/solr/common/util/SolrQueuedThreadPool.java b/solr/solrj/src/java/org/apache/solr/common/util/SolrQueuedThreadPool.java
index b379cb7..e8dc92b 100644
--- a/solr/solrj/src/java/org/apache/solr/common/util/SolrQueuedThreadPool.java
+++ b/solr/solrj/src/java/org/apache/solr/common/util/SolrQueuedThreadPool.java
@@ -725,11 +725,11 @@ public class SolrQueuedThreadPool extends ContainerLifeCycle implements ThreadFa
             idle = false;
 
             // run job
-            if (LOG.isDebugEnabled()) LOG.debug("run {} in {}", job, SolrQueuedThreadPool.this);
+            if (LOG.isTraceEnabled()) LOG.trace("run {} in {}", job, SolrQueuedThreadPool.this);
             runJob(job);
-            if (LOG.isDebugEnabled()) LOG.debug("ran {} in {}", job, SolrQueuedThreadPool.this);
+            if (LOG.isTraceEnabled()) LOG.trace("ran {} in {}", job, SolrQueuedThreadPool.this);
           } catch (InterruptedException e) {
-            if (LOG.isDebugEnabled()) LOG.debug("interrupted {} in {}", job, SolrQueuedThreadPool.this);
+            if (LOG.isTraceEnabled()) LOG.trace("interrupted {} in {}", job, SolrQueuedThreadPool.this);
           } catch (Throwable e) {
             LOG.warn("", e);
           } finally {
diff --git a/solr/solrj/src/java/org/apache/solr/common/util/SysStats.java b/solr/solrj/src/java/org/apache/solr/common/util/SysStats.java
index 05417d2..75a7a17 100644
--- a/solr/solrj/src/java/org/apache/solr/common/util/SysStats.java
+++ b/solr/solrj/src/java/org/apache/solr/common/util/SysStats.java
@@ -19,8 +19,8 @@ public class SysStats extends Thread {
     private static final Logger log = LoggerFactory
         .getLogger(MethodHandles.lookup().lookupClass());
 
-    public static final double OUR_LOAD_HIGH = 1.0;
-    public static final long REFRESH_INTERVAL = TimeUnit.NANOSECONDS.convert(5000, TimeUnit.MILLISECONDS);
+    public static final double OUR_LOAD_HIGH = 3.0;
+    public static final long REFRESH_INTERVAL = TimeUnit.NANOSECONDS.convert(2500, TimeUnit.MILLISECONDS);
     public static final int PROC_COUNT = ManagementFactory.getOperatingSystemMXBean().getAvailableProcessors();
     private final long refreshIntervalMs;
 
@@ -89,12 +89,12 @@ public class SysStats extends Thread {
                     threadTime.setLast(threadBean.getThreadCpuTime(threadTime.getId()));
                 }
 
-                double load =  ManagementFactory.getOperatingSystemMXBean().getSystemLoadAverage();
-                if (load < 0) {
-                    log.warn("SystemLoadAverage not supported on this JVM");
-                } else {
-                    sysLoad = load / (double) PROC_COUNT;
-                }
+//                double load =  ManagementFactory.getOperatingSystemMXBean().getSystemLoadAverage();
+//                if (load < 0) {
+//                    log.warn("SystemLoadAverage not supported on this JVM");
+//                } else {
+//                    sysLoad = load / (double) PROC_COUNT;
+//                }
 
             } else {
                 double load =  ManagementFactory.getOperatingSystemMXBean().getSystemLoadAverage();
@@ -162,6 +162,13 @@ public class SysStats extends Thread {
     }
 
     public double getSystemLoad() {
+        double sysLoad = -1;
+        double load = ManagementFactory.getOperatingSystemMXBean().getSystemLoadAverage();
+        if (load < 0) {
+            log.warn("SystemLoadAverage not supported on this JVM");
+        } else {
+            sysLoad = load / (double) PROC_COUNT;
+        }
         return sysLoad;
     }
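
    getSystemLoad() now computes the normalized load on demand instead of reading a
    cached field the sampling loop no longer fills. The calculation, standalone:

        double load = ManagementFactory.getOperatingSystemMXBean().getSystemLoadAverage();
        int procs = ManagementFactory.getOperatingSystemMXBean().getAvailableProcessors();
        double sysLoad = load < 0 ? -1 : load / procs; // 1.0 = run queue matches core count; -1 if unsupported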
 
diff --git a/solr/core/src/java/org/apache/solr/logging/MDCLoggingContext.java b/solr/solrj/src/java/org/apache/solr/logging/MDCLoggingContext.java
similarity index 53%
rename from solr/core/src/java/org/apache/solr/logging/MDCLoggingContext.java
rename to solr/solrj/src/java/org/apache/solr/logging/MDCLoggingContext.java
index c714644..cb94baf 100644
--- a/solr/core/src/java/org/apache/solr/logging/MDCLoggingContext.java
+++ b/solr/solrj/src/java/org/apache/solr/logging/MDCLoggingContext.java
@@ -16,12 +16,7 @@
  */
 package org.apache.solr.logging;
 
-import org.apache.solr.cloud.CloudDescriptor;
-import org.apache.solr.cloud.ZkController;
 import org.apache.solr.common.StringUtils;
-import org.apache.solr.core.CoreContainer;
-import org.apache.solr.core.CoreDescriptor;
-import org.apache.solr.core.SolrCore;
 import org.slf4j.MDC;
 
 import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
@@ -32,8 +27,7 @@ import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
 
 /**
  * Set's per thread context info for logging. Nested calls will use the top level parent for all context. The first
- * caller always owns the context until it calls {@link #clear()}. Always call {@link #setCore(SolrCore)} or
- * {@link #setCoreDescriptor(CoreContainer, CoreDescriptor)} and then {@link #clear()} in a finally block.
+ * caller always owns the context until it calls {@link #clear()}. Always call {@link #clear()} in a finally block.
  */
 public class MDCLoggingContext {
   public static final String TRACE_ID = "trace_id";
@@ -41,123 +35,58 @@ public class MDCLoggingContext {
   private static ThreadLocal<Integer> CALL_DEPTH = ThreadLocal.withInitial(() -> 0);
 
   public static void setCollection(String collection) {
-    if (collection != null) {
-      MDC.put(COLLECTION_PROP, "c:" + collection);
-    } else {
-      MDC.remove(COLLECTION_PROP);
-    }
+//    if (collection != null) {
+//      MDC.put(COLLECTION_PROP, "cn=" + collection);
+//    } else {
+//      MDC.remove(COLLECTION_PROP);
+//    }
   }
 
   public static void setTracerId(String traceId) {
     if (!StringUtils.isEmpty(traceId)) {
-      MDC.put(TRACE_ID, "t:" + traceId);
+      MDC.put(TRACE_ID, "t=" + traceId);
     } else {
       MDC.remove(TRACE_ID);
     }
   }
   
-  public static void setShard(String shard) {
-    if (shard != null) {
-      MDC.put(SHARD_ID_PROP, "s:" + shard);
-    } else {
-      MDC.remove(SHARD_ID_PROP);
-    }
-  }
-  
-  public static void setReplica(String replica) {
-    if (replica != null) {
-      MDC.put(REPLICA_PROP, "r:" + replica);
-    } else {
-      MDC.remove(REPLICA_PROP);
-    }
-  }
-  
   public static void setCoreName(String core) {
     if (core != null) {
-      MDC.put(CORE_NAME_PROP, "x:" + core);
+      MDC.put(CORE_NAME_PROP, "c=" + core);
     } else {
       MDC.remove(CORE_NAME_PROP);
     }
   }
   
-  public static void setNode(CoreContainer cc) {
-    if (cc != null) {
-      ZkController zk = cc.getZkController();
-      if (zk != null) {
-        setNode(zk.getNodeName());
-      }
-    }
-  }
-  
   // we allow the host to be set like this because it is the same for any thread
   // in the thread pool; we can't do this with the per-core properties!
   public static void setNode(String node) {
-    int used = CALL_DEPTH.get();
-    if (used == 0) {
+//    int used = CALL_DEPTH.get();
+//    if (used == 0) {
       setNodeName(node);
-    }
+//    }
   }
   
   private static void setNodeName(String node) {
     if (node != null) {
-      MDC.put(NODE_NAME_PROP, "n:" + node);
+      MDC.put(NODE_NAME_PROP, "n=" + node);
     } else {
       MDC.remove(NODE_NAME_PROP);
     }
   }
 
   /**
-   * Sets multiple information from the params.
-   * REMEMBER TO CALL {@link #clear()} in a finally!
-   */
-  public static void setCore(SolrCore core) {
-    CoreContainer coreContainer = core == null ? null : core.getCoreContainer();
-    CoreDescriptor coreDescriptor = core == null ? null : core.getCoreDescriptor();
-    setCoreDescriptor(coreContainer, coreDescriptor);
-  }
-
-  /**
-   * Sets multiple information from the params.
-   * REMEMBER TO CALL {@link #clear()} in a finally!
-   */
-  public static void setCoreDescriptor(CoreContainer coreContainer, CoreDescriptor cd) {
-    setNode(coreContainer);
-
-    int callDepth = CALL_DEPTH.get();
-    CALL_DEPTH.set(callDepth + 1);
-    if (callDepth > 0) {
-      return;
-    }
-
-    if (cd != null) {
-
-      assert cd.getName() != null;
-      setCoreName(cd.getName());
-      
-      CloudDescriptor ccd = cd.getCloudDescriptor();
-      if (ccd != null) {
-        setCollection(ccd.getCollectionName());
-        setShard(ccd.getShardId());
-        setReplica(cd.getName());
-      }
-    }
-  }
-
-  /**
-   * Call this after {@link #setCore(SolrCore)} or {@link #setCoreDescriptor(CoreContainer, CoreDescriptor)} in a
+   * Call this in a
    * finally block.
    */
   public static void clear() {
     int used = CALL_DEPTH.get();
-    if (used <= 1) {
+  //  if (used <= 1) {
       CALL_DEPTH.set(0);
-      MDC.remove(COLLECTION_PROP);
       MDC.remove(CORE_NAME_PROP);
-      MDC.remove(REPLICA_PROP);
-      MDC.remove(SHARD_ID_PROP);
-    } else {
-      CALL_DEPTH.set(used - 1);
-    }
+  //  } else {
+   //   CALL_DEPTH.set(used - 1);
+  //  }
   }
   
   private static void removeAll() {
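    With setCore/setCoreDescriptor removed, callers now pair the remaining setters with
    clear() themselves; a minimal usage sketch under that assumption (the MdcUsageSketch
    class, doWork method, and logger are illustrative):

        import org.apache.solr.logging.MDCLoggingContext;
        import org.slf4j.Logger;
        import org.slf4j.LoggerFactory;

        public class MdcUsageSketch {
            private static final Logger log = LoggerFactory.getLogger(MdcUsageSketch.class);

            void doWork(String nodeName, String coreName) {
                MDCLoggingContext.setNode(nodeName);     // rendered by %X{node_name} as "n=<node>"
                MDCLoggingContext.setCoreName(coreName); // rendered by %X{core} as "c=<core>"
                try {
                    log.info("processing request"); // carries the MDC values set above
                } finally {
                    // clear() drops the core name; the node name is intentionally kept,
                    // since it is the same for every thread in the pool.
                    MDCLoggingContext.clear();
                }
            }
        }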
diff --git a/solr/solrj/src/java/org/apache/zookeeper/ZooKeeperExposed.java b/solr/solrj/src/java/org/apache/zookeeper/ZooKeeperExposed.java
index 01d144c..e890259 100644
--- a/solr/solrj/src/java/org/apache/zookeeper/ZooKeeperExposed.java
+++ b/solr/solrj/src/java/org/apache/zookeeper/ZooKeeperExposed.java
@@ -64,7 +64,7 @@ public class ZooKeeperExposed {
         clientCnxn.sendThread.close();
 
         try {
-            clientCnxn.sendThread.join(20);
+            clientCnxn.sendThread.join(50);
         } catch (InterruptedException e) {
         }
 
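    The bumped timeout above only bounds the wait: Thread.join(millis) returns once the
    thread dies or the timeout elapses, whichever comes first. A sketch of those semantics
    (the worker body is illustrative; unlike the code above, this sketch restores the
    interrupt status):

        public class JoinSketch {
            public static void main(String[] args) {
                Thread worker = new Thread(() -> { /* drain outstanding packets */ });
                worker.start();
                worker.interrupt(); // request shutdown
                try {
                    // join(50) waits at most ~50 ms; it does not guarantee the thread
                    // has exited, so callers must tolerate a still-running thread.
                    worker.join(50);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
        }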
diff --git a/solr/test-framework/src/java/org/apache/solr/SolrTestCase.java b/solr/test-framework/src/java/org/apache/solr/SolrTestCase.java
index efa6f0c..647ccb4 100644
--- a/solr/test-framework/src/java/org/apache/solr/SolrTestCase.java
+++ b/solr/test-framework/src/java/org/apache/solr/SolrTestCase.java
@@ -314,6 +314,7 @@ public class SolrTestCase extends LuceneTestCase {
 
       // can make things quite slow
       System.setProperty("solr.disableDefaultJmxReporter", "true");
+
       System.setProperty("solr.skipCommitOnClose", "false");
 
       // can generate tons of URL garbage and can happen too often, defaults to false now anyway
diff --git a/solr/test-framework/src/resources/logconf/log4j2-startup-debug.xml b/solr/test-framework/src/resources/logconf/log4j2-startup-debug.xml
index 22e1955..9cd51b16 100644
--- a/solr/test-framework/src/resources/logconf/log4j2-startup-debug.xml
+++ b/solr/test-framework/src/resources/logconf/log4j2-startup-debug.xml
@@ -20,11 +20,11 @@
     <Appenders>
 
         <Console name="STDERR_COLOR" target="SYSTEM_ERR">
-            <PatternLayout pattern="%style{%-4r}{yellow} %highlight{%maxLen{%-5p}{6}} [%style{%X{replica} %X{trace_id}}{cyan}] %style{%c{1.}}{cyan} %highlight{%m %notEmpty{%ex}}\n"/>
+            <PatternLayout pattern="%style{%-4r}{yellow} %highlight{%maxLen{%-5p}{6}} [%style{%X{core}%X{node_name}}{cyan}] %style{%c{1.}}{cyan} %highlight{%m %notEmpty{%ex}}\n"/>
         </Console>
 
         <File name="FILE" fileName="${sys:user.home}/solr-test.log" immediateFlush="false" append="false">
-            <PatternLayout pattern="%style{%-4r}{yellow} %highlight{%maxLen{%-5p}{6}} %style{(%t)}{yellow,bold} [%style{%X{replica} %X{trace_id}}{cyan}] %style{%c{1.}}{cyan} %highlight{%m %notEmpty{%ex}}\n"/>
+            <PatternLayout pattern="%style{%-4r}{yellow} %highlight{%maxLen{%-5p}{6}} %style{(%t)}{yellow,bold} [%style{%X{core}%X{node_name}}{cyan}] %style{%c{1.}}{cyan} %highlight{%m %notEmpty{%ex}}\n"/>
         </File>
 
         <File name="FILE2" fileName="${sys:user.home}/solr-test.log" immediateFlush="false" append="false">
@@ -39,10 +39,10 @@
 
     </Appenders>
     <Loggers>
-
-
         <AsyncLogger name="org.apache.solr.servlet.HttpSolrCall" level="DEBUG"/>
         <AsyncLogger name="org.apache.zookeeper" level="WARN"/>
+        <AsyncLogger name="org.apache.zookeeper.ClientCnxn" level="ERROR"/>
+        <AsyncLogger name="org.apache.zookeeper.server.ZooKeeperCriticalThread" level="OFF"/>
         <AsyncLogger name="org.apache.hadoop" level="WARN"/>
         <AsyncLogger name="org.apache.directory" level="WARN"/>
         <AsyncLogger name="org.apache.solr.hadoop" level="INFO"/>
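    The rewritten patterns read the MDC keys that MDCLoggingContext now sets; a minimal
    sketch of how %X{core} and %X{node_name} get populated, using slf4j's MDC directly
    (the key strings match the patterns above; the PatternDemo class is illustrative):

        import org.slf4j.Logger;
        import org.slf4j.LoggerFactory;
        import org.slf4j.MDC;

        public class PatternDemo {
            private static final Logger log = LoggerFactory.getLogger(PatternDemo.class);

            public static void main(String[] args) {
                MDC.put("core", "c=myCore");                   // read by %X{core}
                MDC.put("node_name", "n=127.0.0.1:8983_solr"); // read by %X{node_name}
                log.info("hello"); // MDC values fill the bracketed %X{...} fields above
                MDC.remove("core");
                MDC.remove("node_name");
            }
        }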
diff --git a/solr/test-framework/src/resources/logconf/log4j2-std-debug.xml b/solr/test-framework/src/resources/logconf/log4j2-std-debug.xml
index b4cbf94..8909f02 100644
--- a/solr/test-framework/src/resources/logconf/log4j2-std-debug.xml
+++ b/solr/test-framework/src/resources/logconf/log4j2-std-debug.xml
@@ -20,16 +20,19 @@
     <Appenders>
 
         <Console name="STDERR_COLOR" target="SYSTEM_ERR">
-            <PatternLayout pattern="%style{%-4r}{yellow} %highlight{%maxLen{%-5p}{6}} %style{(%t)}{yellow,bold} [%style{%X{node_name} %X{collection} %X{shard} %X{replica} %X{core} %X{trace_id}}{cyan}] %style{%c{1.}}{cyan} %highlight{%m %notEmpty{%ex}}\n"/>
+            <PatternLayout pattern="%style{%-4r}{yellow} %highlight{%maxLen{%-5p}{6}} %style{(%maxLen{%t}{8})}{yellow,bold} [%style{%X{node_name} %X{core}}{cyan}] %style{%c{1.}}{cyan} %highlight{%m %notEmpty{%ex}}\n"/>
         </Console>
 
         <File name="FILE" fileName="${sys:user.home}/solr-test.log" immediateFlush="false" append="false">
-            <PatternLayout pattern="%style{%-4r}{yellow} %highlight{%maxLen{%-5p}{6}} %style{(%t)}{yellow,bold} [%style{%X{node_name} %X{collection} %X{shard} %X{replica} %X{core} %X{trace_id}}{cyan}] %style{%c{1.}}{cyan} %highlight{%m %notEmpty{%ex}}\n"/>
+            <PatternLayout pattern="%style{%-4r}{yellow} %highlight{%maxLen{%-5p}{6}} %style{(%t)}{yellow,bold} [%style{%X{node_name} %X{core}}{cyan}] %style{%c{1.}}{cyan} %highlight{%m %notEmpty{%ex}}\n"/>
         </File>
 
     </Appenders>
     <Loggers>
         <AsyncLogger name="org.apache.zookeeper" level="WARN"/>
+        <AsyncLogger name="org.apache.zookeeper.ClientCnxn" level="ERROR"/>
+        <AsyncLogger name="org.apache.zookeeper.server.ZooKeeperCriticalThread" level="OFF"/>
+
         <AsyncLogger name="org.apache.hadoop" level="WARN"/>
         <AsyncLogger name="org.apache.directory" level="WARN"/>
         <AsyncLogger name="org.apache.solr.hadoop" level="INFO"/>
@@ -51,6 +54,8 @@
         <AsyncLogger name="org.apache.solr.update.SolrCmdDistributor" level="DEBUG"/>
         <AsyncLogger name="org.apache.solr.update.processor.LogUpdateProcessorFactory" level="DEBUG"/>
 
+        <AsyncLogger name="org.apache.solr.common.ParWork" level="DEBUG"/>
+
 
         <AsyncLogger name="com.google.inject.servlet" level="DEBUG"/>
         <AsyncLogger name="org.apache.solr.client.solrj.impl.Http2SolrClient" level="DEBUG"/>