You are viewing a plain text version of this content. The canonical (HTML) link was not preserved in this plain-text export.
Posted to commits@lucene.apache.org by ma...@apache.org on 2021/01/24 02:18:07 UTC

[lucene-solr] branch reference_impl_dev updated (0f23e09 -> 4a4e36a)

This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a change to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git.


    from 0f23e09  @1276 Don't interrupt election and SolrQos tweaks and recovery tweaks.
     new 960063a  @1277 Don't keep going on bulk update request on a 404.
     new 27bb506  @1278 Explicit truncation cast.
     new 7a85375  @1279 Core replacing only if using shared storage.
     new ad55ecb  @1280 Election tweaks for shard leader.
     new 65b1ef8  @1281 Clean up some MDC core logging.
     new 5351ab3  @1282 Fix long to int for real.
     new 84f1dff  @1283 Don't cancel recovery.
     new 57e1627  @1284 Allow ACTIVE as well again.
     new 1f2b383  @1285 Cleanup
     new 2830a37  @1286 Better error message.
     new 94dbe77  @1287 Tweak prep recovery cmd.
     new 34cc966  @1288 Only check state in prep recovery and don't try to unload cores that look to have moved.
     new 6884bde  @1289 Allow getting the leader before being live.
     new f1e3015  wip
     new 02ec26f  wip
     new 4a4e36a  wip

The 16 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../apache/lucene/store/NRTCachingDirectory.java   |   4 +-
 solr/bin/solr                                      |   2 +-
 .../src/java/org/apache/solr/cloud/CloudUtil.java  |  19 --
 .../java/org/apache/solr/cloud/LeaderElector.java  | 254 ++++++++-------
 .../src/java/org/apache/solr/cloud/Overseer.java   |  27 +-
 .../apache/solr/cloud/OverseerElectionContext.java |   3 +-
 .../org/apache/solr/cloud/OverseerTaskQueue.java   |   2 +
 .../solr/cloud/RecoveringCoreTermWatcher.java      |  10 +-
 .../org/apache/solr/cloud/RecoveryStrategy.java    |  76 +++--
 .../solr/cloud/ShardLeaderElectionContext.java     |  62 ++--
 .../solr/cloud/ShardLeaderElectionContextBase.java |   8 +
 .../java/org/apache/solr/cloud/ZkController.java   | 128 +++-----
 .../java/org/apache/solr/cloud/ZkShardTerms.java   |  42 +--
 .../apache/solr/cloud/overseer/ZkStateWriter.java  | 218 +++++++------
 .../apache/solr/core/CachingDirectoryFactory.java  |   2 +-
 .../java/org/apache/solr/core/CoreContainer.java   | 345 ++++++++++-----------
 .../apache/solr/core/CorePropertiesLocator.java    |   4 +-
 .../src/java/org/apache/solr/core/SolrCore.java    |  24 +-
 .../src/java/org/apache/solr/core/SolrCores.java   |   2 +-
 .../java/org/apache/solr/handler/IndexFetcher.java |  24 +-
 .../solr/handler/admin/CoreAdminHandler.java       |   1 +
 .../solr/handler/admin/CoreAdminOperation.java     |  59 ++--
 .../apache/solr/handler/admin/PrepRecoveryOp.java  |  11 +-
 .../solr/handler/component/HttpShardHandler.java   |   2 +-
 .../handler/component/RealTimeGetComponent.java    |   5 +-
 .../apache/solr/schema/ZkIndexSchemaReader.java    |   2 +
 .../org/apache/solr/servlet/SolrQoSFilter.java     |  86 ++---
 .../org/apache/solr/update/AddUpdateCommand.java   |  10 +-
 .../apache/solr/update/DefaultSolrCoreState.java   |  12 +-
 .../src/java/org/apache/solr/update/PeerSync.java  | 102 +++---
 .../org/apache/solr/update/PeerSyncWithLeader.java |  18 +-
 .../org/apache/solr/update/SolrCmdDistributor.java |  24 +-
 .../java/org/apache/solr/update/SolrCoreState.java |   4 +-
 .../org/apache/solr/update/SolrIndexSplitter.java  |   2 +-
 .../org/apache/solr/update/SolrIndexWriter.java    |   8 +-
 .../src/java/org/apache/solr/update/UpdateLog.java | 109 ++++---
 .../processor/DistributedZkUpdateProcessor.java    |  10 +-
 .../solr/cloud/MissingSegmentRecoveryTest.java     |   2 +
 .../CollectionsAPIDistributedZkTest.java           |   1 +
 .../apache/solr/update/MockingHttp2SolrClient.java |   2 +-
 solr/server/etc/jetty-https.xml                    |   2 +-
 solr/server/etc/jetty-https8.xml                   |   5 +
 solr/server/etc/jetty.xml                          |   4 -
 solr/server/resources/log4j2.xml                   |   4 +-
 .../client/solrj/impl/CloudHttp2SolrClient.java    |   2 +-
 .../solr/client/solrj/impl/Http2SolrClient.java    |  22 +-
 .../solr/client/solrj/impl/LBHttp2SolrClient.java  |   8 +-
 .../solr/client/solrj/util/AsyncListener.java      |   2 +-
 .../src/java/org/apache/solr/common/ParWork.java   |  16 +-
 .../apache/solr/common/PerThreadExecService.java   |  20 +-
 .../org/apache/solr/common/SolrInputDocument.java  |   3 +-
 .../apache/solr/common/cloud/SolrZooKeeper.java    |   6 +-
 .../apache/solr/common/cloud/ZkStateReader.java    | 259 ++++++++--------
 .../java/org/apache/solr/common/util/SysStats.java |  23 +-
 .../org/apache/zookeeper/ZooKeeperExposed.java     |   2 +-
 55 files changed, 1061 insertions(+), 1043 deletions(-)


[lucene-solr] 04/16: @1280 Election tweaks for shard leader.

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit ad55ecb004b32d4e99ed02cb9c392d760efcb3b2
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Thu Jan 21 20:37:49 2021 -0600

    @1280 Election tweaks for shard leader.
---
 .../java/org/apache/solr/cloud/LeaderElector.java  | 14 +++++-----
 .../solr/cloud/RecoveringCoreTermWatcher.java      |  4 +++
 .../solr/cloud/ShardLeaderElectionContext.java     | 11 +++++---
 .../solr/cloud/ShardLeaderElectionContextBase.java |  6 +++++
 .../java/org/apache/solr/cloud/ZkController.java   | 30 +++++++++-------------
 5 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
index fcbdd4b..1e6dc35 100644
--- a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
+++ b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
@@ -447,11 +447,14 @@ public class LeaderElector implements Closeable {
   }
 
   @Override
-  public void close() throws IOException {
+  public synchronized void close() throws IOException {
     assert ObjectReleaseTracker.release(this);
     state = CLOSED;
     this.isClosed = true;
+
     IOUtils.closeQuietly(watcher);
+    watcher = null;
+
     if (context != null) {
       try {
         context.cancelElection();
@@ -461,9 +464,9 @@ public class LeaderElector implements Closeable {
     }
     try {
       if (joinFuture != null) {
-        joinFuture.cancel(false);
+        joinFuture.get();
       }
-    } catch (NullPointerException e) {
+    } catch (Exception e) {
       // okay
     }
   }
@@ -588,9 +591,7 @@ public class LeaderElector implements Closeable {
       log.info("Closed, won't rejoin election");
       throw new AlreadyClosedException();
     }
-    ElectionWatcher watcher = this.watcher;
-    IOUtils.closeQuietly(watcher);
-    this.watcher = null;
+
     IOUtils.closeQuietly(this);
     context.leaderSeqPath = null;
     context.watchedSeqPath = null;
@@ -602,6 +603,7 @@ public class LeaderElector implements Closeable {
     isClosed = false;
     isCancelled = false;
     joinFuture = null;
+    state = OUT_OF_ELECTION;
     joinElection(true, joinAtHead);
   }
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveringCoreTermWatcher.java b/solr/core/src/java/org/apache/solr/cloud/RecoveringCoreTermWatcher.java
index 7545282..5416b89 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveringCoreTermWatcher.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveringCoreTermWatcher.java
@@ -60,6 +60,10 @@ public class RecoveringCoreTermWatcher implements ZkShardTerms.CoreTermWatcher,
       if (terms.getTerm(coreName) != null && lastTermDoRecovery.get() < terms.getTerm(coreName)) {
         log.info("Start recovery on {} because core's term is less than leader's term", coreName);
         lastTermDoRecovery.set(terms.getTerm(coreName));
+        LeaderElector leaderElector = coreContainer.getZkController().getLeaderElector(coreName);
+        if (leaderElector != null) {
+          leaderElector.retryElection(false);
+        }
         try (SolrCore solrCore = coreContainer.getCore(coreDescriptor.getName())) {
           solrCore.getUpdateHandler().getSolrCoreState().doRecovery(solrCore.getCoreContainer(), solrCore.getCoreDescriptor());
         }
diff --git a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
index dfd970a..feabe53 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
@@ -109,6 +109,9 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
         return false;
       }
       try {
+
+        core.getSolrCoreState().cancelRecovery(true, false);
+
         ActionThrottle lt;
 
         MDCLoggingContext.setCore(core);
@@ -188,11 +191,11 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
             // - we were active
             // before, so become leader anyway if no one else has any versions either
             if (result.getOtherHasVersions().orElse(false)) {
-              log.info("We failed sync, but we have no versions - we can't sync in that case. But others have some versions, so we should not become leader");
+              log.info("We failed sync, but we have no versions - we can't sync in that case. But others have some versions, so we should not become leader {}", coreName);
               rejoinLeaderElection(core);
               return false;
             } else {
-              log.info("We failed sync, but we have no versions - we can't sync in that case - we did not find versions on other replicas, so become leader anyway");
+              log.info("We failed sync, but we have no versions - we can't sync in that case - we did not find versions on other replicas, so become leader anyway {}", coreName);
               success = true;
             }
           }
@@ -215,7 +218,7 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
 //          }
 //        }
         if (!success) {
-          log.info("Sync with potential leader failed, rejoining election ...");
+          log.info("Sync with potential leader failed, rejoining election {} ...", coreName);
           rejoinLeaderElection(core);
           return false;
         }
@@ -329,7 +332,7 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
           throws InterruptedException, KeeperException, IOException {
     // remove our ephemeral and re join the election
 
-    log.info("There may be a better leader candidate than us - will cancel election, rejoin election, and kick off recovery");
+    log.info("There may be a better leader candidate than us - will cancel election, rejoin election, and kick off recovery {}", core.getName());
 
     leaderElector.retryElection(false);
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
index 1754d50..1a3ac74 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
@@ -150,6 +150,12 @@ class ShardLeaderElectionContextBase extends ElectionContext {
         if (log.isDebugEnabled()) log.debug("No version found for ephemeral leader parent node, won't remove previous leader registration. {} {}", leaderPath, leaderSeqPath);
       }
     } catch (Exception e) {
+
+      if (leaderSeqPath != null) {
+        if (log.isDebugEnabled()) log.debug("Delete leader seq election path {} path we watch is {}", leaderSeqPath, watchedSeqPath);
+        zkClient.delete(leaderSeqPath, -1);
+      }
+
       if (e instanceof InterruptedException) {
         ParWork.propagateInterrupt(e);
       }
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index c4e9db6..1f4ecad 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -183,6 +183,11 @@ public class ZkController implements Closeable, Runnable {
     return  elector;
   }
 
+  public LeaderElector getLeaderElector(String name) {
+    LeaderElector elector = leaderElectors.get(name);
+    return elector;
+  }
+
   static class ContextKey {
 
     private String collection;
@@ -2084,24 +2089,13 @@ public class ZkController implements Closeable, Runnable {
       MDCLoggingContext.setCoreDescriptor(cc, cc.getCoreDescriptor(coreName));
 
       log.info("Rejoin the shard leader election.");
-
-      ContextKey contextKey = new ContextKey(collectionName, coreName);
-
-
-      Map<String, Object> props = new HashMap<>();
-      props.put(ZkStateReader.NODE_NAME_PROP, getNodeName());
-
-      Replica replica = new Replica(coreName, props, collectionName, shardId, zkStateReader);
-
-      LeaderElector elect =  leaderElectors.get(replica.getName());
-
-      ShardLeaderElectionContext context = new ShardLeaderElectionContext(elect, shardId, collectionName,
-          coreName, replica, this, getCoreContainer());
-
-      context.leaderSeqPath = context.electionPath + LeaderElector.ELECTION_NODE + "/" + electionNode;
-      elect.setup(context);
-
-      elect.retryElection(params.getBool(REJOIN_AT_HEAD_PROP, false));
+      LeaderElector elect =  leaderElectors.get(coreName);
+      if (elect != null) {
+        elect.retryElection(params.getBool(REJOIN_AT_HEAD_PROP, false));
+      }
+      try (SolrCore core = getCoreContainer().getCore(coreName)) {
+        core.getSolrCoreState().doRecovery(core);
+      }
     } catch (Exception e) {
       ParWork.propagateInterrupt(e);
       throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to rejoin election", e);


[lucene-solr] 02/16: @1278 Explicit truncation cast.

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 27bb5065fe8b42e93bd2cf45d652a1c729c5e161
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Thu Jan 21 19:39:13 2021 -0600

    @1278 Explicit truncation cast.
---
 solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java b/solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java
index 84b3c04..c8f120e 100644
--- a/solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java
+++ b/solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java
@@ -87,7 +87,7 @@ public class SolrQoSFilter extends QoSFilter {
             if (log.isDebugEnabled()) log.debug("set max concurrent requests to orig value {}", _origMaxRequests);
             updateMaxRequests(_origMaxRequests, sLoad, ourLoad);
           } else {
-            updateMaxRequests(Math.min(_origMaxRequests, Math.round(getMaxRequests() * 3)), sLoad, ourLoad);
+            updateMaxRequests(Math.min(_origMaxRequests, (int) Math.round(getMaxRequests() * 3)), sLoad, ourLoad);
           }
         } else {
           if (ourLoad > 0.90 && sLoad > 1.5) {
@@ -102,7 +102,7 @@ public class SolrQoSFilter extends QoSFilter {
               if (log.isDebugEnabled()) log.debug("set max concurrent requests to orig value {}", _origMaxRequests);
               updateMaxRequests(_origMaxRequests, sLoad, ourLoad);
             } else {
-              updateMaxRequests(Math.min(_origMaxRequests, Math.round(getMaxRequests() * 3)), sLoad, ourLoad);
+              updateMaxRequests(Math.min(_origMaxRequests, (int) Math.round(getMaxRequests() * 3)), sLoad, ourLoad);
             }
 
           }


[lucene-solr] 15/16: wip

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 02ec26f9273646ff2eb19d6669c062ed4c94ddfb
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Fri Jan 22 21:21:56 2021 -0600

    wip
---
 .../java/org/apache/solr/cloud/LeaderElector.java  |   6 +-
 .../apache/solr/cloud/OverseerElectionContext.java |   3 +-
 .../solr/cloud/ShardLeaderElectionContextBase.java |   2 +
 .../apache/solr/common/cloud/ZkStateReader.java    | 169 ++++++++-------------
 4 files changed, 69 insertions(+), 111 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
index ac2cbe9..21b2a41 100644
--- a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
+++ b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
@@ -237,8 +237,10 @@ public class LeaderElector implements Closeable {
             log.warn("Failed setting election watch, retrying {} {}", e.getClass().getName(), e.getMessage());
             state = OUT_OF_ELECTION;
             return true;
-          } catch (Exception e) {
+          } catch (AlreadyClosedException e) {
             state = OUT_OF_ELECTION;
+            return false;
+          } catch (Exception e) {
             // we couldn't set our watch for some other reason, retry
             log.error("Failed setting election watch {} {}", e.getClass().getName(), e.getMessage());
             state = OUT_OF_ELECTION;
@@ -252,7 +254,7 @@ public class LeaderElector implements Closeable {
         return true;
       } catch (AlreadyClosedException e) {
         state = OUT_OF_ELECTION;
-        return true;
+        return false;
       } catch (Exception e) {
         state = OUT_OF_ELECTION;
         return true;
diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java
index 43feae3..4de358e 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java
@@ -17,6 +17,7 @@
 
 package org.apache.solr.cloud;
 
+import org.apache.solr.common.AlreadyClosedException;
 import org.apache.solr.common.ParWork;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.SolrZkClient;
@@ -56,7 +57,7 @@ final class OverseerElectionContext extends ShardLeaderElectionContextBase {
 
     if (overseer.isDone()) {
       log.info("Already closed, bailing ...");
-      return false;
+      throw new AlreadyClosedException();
     }
 
     // TODO: the idea here is that we could clear the Overseer queue
diff --git a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
index 1a3ac74..d27db60 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
@@ -218,6 +218,8 @@ class ShardLeaderElectionContextBase extends ElectionContext {
       zkClient.delete(leaderPath, -1);
 
       runLeaderProcess(context, weAreReplacement, pauseBeforeStartMs);
+    } catch (AlreadyClosedException e) {
+      throw e;
     } catch (Throwable t) {
       log.warn("Could not register as the leader because creating the ephemeral registration node in ZooKeeper failed: ", t);
       throw new SolrException(ErrorCode.SERVER_ERROR, "Could not register as the leader because creating the ephemeral registration node in ZooKeeper failed: " + errors, t);
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
index 0e3a685..f4e7dd6 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
@@ -211,8 +211,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
 
   private final ConcurrentHashMap<String, CollectionWatch<DocCollectionWatcher>> collectionWatches = new ConcurrentHashMap<>(32, 0.75f, 3);
 
-  private final ConcurrentHashMap<String, ReentrantLock> collectionLocks = new ConcurrentHashMap<>(32, 0.75f, 3);
-
   private final Map<String,StateWatcher> stateWatchersMap = new ConcurrentHashMap<>(32, 0.75f, 3);
 
   // named this observers so there's less confusion between CollectionPropsWatcher map and the PropsWatcher map.
@@ -369,15 +367,9 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
       Collection<String> safeCopy = new ArrayList<>(watchedCollectionStates.keySet());
       Set<String> updatedCollections = new HashSet<>();
       for (String coll : safeCopy) {
-        ReentrantLock lock = collectionLocks.get(coll);
-        if (lock != null) lock.lock();
-        try {
-          DocCollection newState = fetchCollectionState(coll, null);
-          if (updateWatchedCollection(coll, newState)) {
-            updatedCollections.add(coll);
-          }
-        } finally {
-          if (lock != null) lock.unlock();
+        DocCollection newState = fetchCollectionState(coll, null);
+        if (updateWatchedCollection(coll, newState)) {
+          updatedCollections.add(coll);
         }
       }
       constructState(updatedCollections);
@@ -393,8 +385,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
 
 
   public void forciblyRefreshClusterStateSlow(String name) {
-    ReentrantLock lock = collectionLocks.get(name);
-    if (lock != null) lock.lock();
     try {
       refreshCollectionList(null);
       refreshLiveNodes(null);
@@ -416,8 +406,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
     } catch (InterruptedException e) {
       ParWork.propagateInterrupt(e);
       throw new SolrException(ErrorCode.SERVER_ERROR, e);
-    } finally {
-      if (lock != null) lock.unlock();
     }
   }
 
@@ -429,35 +417,28 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
   }
 
   public Integer compareStateVersions(String coll, int version) {
-    DocCollection collection = null;
-    ReentrantLock lock = collectionLocks.get(coll);
-    if (lock != null) lock.lock();
-    try {
-      collection = clusterState.getCollectionOrNull(coll);
-      if (collection == null) return null;
-      if (collection.getZNodeVersion() < version) {
-        if (log.isDebugEnabled()) {
-          log.debug("Server older than client {}<{}", collection.getZNodeVersion(), version);
-        }
-        DocCollection nu = getCollectionLive(this, coll);
-        if (nu == null) return -3;
-        if (nu.getZNodeVersion() > collection.getZNodeVersion()) {
-          if (updateWatchedCollection(coll, nu)) {
-            constructState(Collections.singleton(coll));
-          }
-          collection = nu;
+    DocCollection collection = clusterState.getCollectionOrNull(coll);
+    if (collection == null) return null;
+    if (collection.getZNodeVersion() < version) {
+      if (log.isDebugEnabled()) {
+        log.debug("Server older than client {}<{}", collection.getZNodeVersion(), version);
+      }
+      DocCollection nu = getCollectionLive(this, coll);
+      if (nu == null) return -3;
+      if (nu.getZNodeVersion() > collection.getZNodeVersion()) {
+        if (updateWatchedCollection(coll, nu)) {
+          constructState(Collections.singleton(coll));
         }
+        collection = nu;
       }
+    }
 
-      if (collection.getZNodeVersion() == version) {
-        return null;
-      }
+    if (collection.getZNodeVersion() == version) {
+      return null;
+    }
 
-      if (log.isDebugEnabled()) {
-        log.debug("Wrong version from client [{}]!=[{}]", version, collection.getZNodeVersion());
-      }
-    } finally {
-      if (lock != null) lock.unlock();
+    if (log.isDebugEnabled()) {
+      log.debug("Wrong version from client [{}]!=[{}]", version, collection.getZNodeVersion());
     }
 
     return collection.getZNodeVersion();
@@ -1388,26 +1369,19 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
 
       if (closed) return;
 
-      ReentrantLock lock = collectionLocks.get(coll);
-      if (lock != null) lock.lock();
-      try {
-//        if (!collectionWatches.containsKey(coll)) {
-//          // This collection is no longer interesting, stop watching.
-//          log.debug("Uninteresting collection {}", coll);
-//          return;
-//        }
-
-        Set<String> liveNodes = ZkStateReader.this.liveNodes;
-        if (log.isInfoEnabled()) {
-          log.info("A cluster state change: [{}] for collection [{}] has occurred - updating... (live nodes size: [{}])", event, coll, liveNodes.size());
-        }
-
-        refreshAndWatch();
+      if (!collectionWatches.containsKey(coll)) {
+        // This collection is no longer interesting, stop watching.
+        log.debug("Uninteresting collection {}", coll);
+        return;
+      }
 
-      } finally {
-        if (lock != null) lock.unlock();
+      Set<String> liveNodes = ZkStateReader.this.liveNodes;
+      if (log.isInfoEnabled()) {
+        log.info("A cluster state change: [{}] for collection [{}] has occurred - updating... (live nodes size: [{}])", event, coll, liveNodes.size());
       }
 
+      refreshAndWatch();
+
     }
 
     /**
@@ -1445,8 +1419,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
           }
           if (log.isDebugEnabled()) log.debug("_statupdates event {}", event);
 
-          ReentrantLock lock = collectionLocks.get(coll);
-          if (lock != null) lock.lock();
           try {
 
             //            if (event.getType() == EventType.NodeDataChanged ||
@@ -1456,8 +1428,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
 
           } catch (Exception e) {
             log.error("Unwatched collection: [{}]", coll, e);
-          } finally {
-            if (lock != null) lock.unlock();
           }
         }
 
@@ -1902,8 +1872,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
     if (reconstructState.get()) {
       StateWatcher sw = new StateWatcher(collection);
       stateWatchersMap.put(collection, sw);
-      ReentrantLock lock = new ReentrantLock(true);
-      collectionLocks.put(collection, lock);
       sw.refreshAndWatch();
       sw.watchStateUpdates();
     }
@@ -1929,29 +1897,24 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
       throw new IllegalArgumentException("Collection cannot be null");
     }
     AtomicBoolean reconstructState = new AtomicBoolean(false);
-    ReentrantLock lock = collectionLocks.get(collection);
-    if (lock != null) lock.lock();
-    try {
-      collectionWatches.compute(collection, (k, v) -> {
-        if (v == null) return null;
-        v.coreRefCount.decrementAndGet();
-        if (v.canBeRemoved()) {
-          watchedCollectionStates.remove(collection);
-          collectionLocks.remove(collection);
-          IOUtils.closeQuietly(stateWatchersMap.remove(collection));
-          lazyCollectionStates.put(collection, new LazyCollectionRef(collection));
-          reconstructState.set(true);
-          return null;
-        }
-        return v;
-      });
 
-      if (reconstructState.get()) {
-        constructState(Collections.emptySet());
+    collectionWatches.compute(collection, (k, v) -> {
+      if (v == null) return null;
+      v.coreRefCount.decrementAndGet();
+      if (v.canBeRemoved()) {
+        watchedCollectionStates.remove(collection);
+        IOUtils.closeQuietly(stateWatchersMap.remove(collection));
+        lazyCollectionStates.put(collection, new LazyCollectionRef(collection));
+        reconstructState.set(true);
+        return null;
       }
-    } finally {
-      if (lock != null) lock.unlock();
+      return v;
+    });
+
+    if (reconstructState.get()) {
+      constructState(Collections.emptySet());
     }
+
   }
 
   /**
@@ -2015,8 +1978,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
     if (watchSet.get()) {
       StateWatcher sw = new StateWatcher(collection);
       stateWatchersMap.put(collection, sw);
-      ReentrantLock lock = new ReentrantLock(true);
-      collectionLocks.put(collection, lock);
       sw.refreshAndWatch();
       sw.watchStateUpdates();
     }
@@ -2198,30 +2159,23 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
 
     AtomicBoolean reconstructState = new AtomicBoolean(false);
 
-    ReentrantLock lock = collectionLocks.get(collection);
-    if (lock != null) lock.lock();
-    try {
-      collectionWatches.compute(collection, (k, v) -> {
-        if (v == null) return null;
-        v.stateWatchers.remove(watcher);
-        if (v.canBeRemoved()) {
-          log.info("no longer watch collection {}", collection);
-          watchedCollectionStates.remove(collection);
-          lazyCollectionStates.put(collection, new LazyCollectionRef(collection));
-          collectionLocks.remove(collection);
-          StateWatcher stateWatcher = stateWatchersMap.remove(collection);
-          if (stateWatcher != null) {
-            IOUtils.closeQuietly(stateWatcher);
-          }
-          reconstructState.set(true);
-          return null;
+    collectionWatches.compute(collection, (k, v) -> {
+      if (v == null) return null;
+      v.stateWatchers.remove(watcher);
+      if (v.canBeRemoved()) {
+        log.info("no longer watch collection {}", collection);
+        watchedCollectionStates.remove(collection);
+        lazyCollectionStates.put(collection, new LazyCollectionRef(collection));
+        StateWatcher stateWatcher = stateWatchersMap.remove(collection);
+        if (stateWatcher != null) {
+          IOUtils.closeQuietly(stateWatcher);
         }
-        return v;
-      });
+        reconstructState.set(true);
+        return null;
+      }
+      return v;
+    });
 
-    } finally {
-      if (lock != null) lock.unlock();
-    }
   }
 
   /* package-private for testing */
@@ -2247,7 +2201,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
       if (newState == null) {
         if (log.isDebugEnabled()) log.debug("Removing cached collection state for [{}]", coll);
         watchedCollectionStates.remove(coll);
-        collectionLocks.remove(coll);
         IOUtils.closeQuietly(stateWatchersMap.remove(coll));
         lazyCollectionStates.remove(coll);
         if (collectionRemoved != null) {


[lucene-solr] 13/16: @1289 Allow getting the leader before being live.

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 6884bde5845af3968b685347bba88a6d6cfdf3b5
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Thu Jan 21 23:35:03 2021 -0600

    @1289 Allow getting the leader before being live.
---
 .../org/apache/solr/cloud/RecoveryStrategy.java    |  5 ++-
 .../java/org/apache/solr/cloud/ZkController.java   |  5 +--
 .../java/org/apache/solr/handler/IndexFetcher.java |  2 +-
 .../processor/DistributedZkUpdateProcessor.java    | 10 ++---
 .../apache/solr/common/cloud/ZkStateReader.java    | 52 ++++++++++++++++++----
 5 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index ff6b669..eb377d6 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -350,7 +350,8 @@ public class RecoveryStrategy implements Runnable, Closeable {
           // expected
         }
 
-        Replica leader = zkController.getZkStateReader().getLeaderRetry(coreDescriptor.getCollectionName(), coreDescriptor.getCloudDescriptor().getShardId(), 1500);
+        Replica leader = zkController.getZkStateReader().getLeaderRetry(coreDescriptor.getCollectionName(), coreDescriptor.getCloudDescriptor().getShardId(), 1500, false);
+
         if (leader != null && leader.getName().equals(coreName)) {
           log.info("We are the leader, STOP recovery");
           close = true;
@@ -404,7 +405,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
         CloudDescriptor cloudDesc = this.coreDescriptor.getCloudDescriptor();
         Replica leader;
         try {
-          leader = zkStateReader.getLeaderRetry(cloudDesc.getCollectionName(), cloudDesc.getShardId(), 1500);
+          leader = zkStateReader.getLeaderRetry(cloudDesc.getCollectionName(), cloudDesc.getShardId(), 1500, false);
 
           if (leader != null && leader.getName().equals(coreName)) {
             log.info("We are the leader, STOP recovery");
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index a5a6b19..f146d1b 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -1433,11 +1433,8 @@ public class ZkController implements Closeable, Runnable {
             throw new AlreadyClosedException();
           }
 
-          leader = zkStateReader.getLeaderRetry(collection, shardId, 500);
+          leader = zkStateReader.getLeaderRetry(collection, shardId, 500, false);
 
-          if (zkClient.exists(COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" + shardId  + "/leader")) {
-            break;
-          }
         } catch (TimeoutException timeoutException) {
 
         }
diff --git a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
index 1419340..6ac1139 100644
--- a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
+++ b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
@@ -726,7 +726,7 @@ public class IndexFetcher {
     ZkController zkController = solrCore.getCoreContainer().getZkController();
     CloudDescriptor cd = solrCore.getCoreDescriptor().getCloudDescriptor();
     Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry(
-        cd.getCollectionName(), cd.getShardId());
+        cd.getCollectionName(), cd.getShardId(), 1500, false);
     return leaderReplica;
   }
 
diff --git a/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java
index 7e8faeb..a5a8847 100644
--- a/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java
+++ b/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java
@@ -190,7 +190,7 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
             EnumSet.of(Replica.Type.TLOG, Replica.Type.NRT), true);
 
         try {
-          leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, cloudDesc.getShardId(), 1000);
+          leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, cloudDesc.getShardId(), 1000, false);
         } catch (Exception e) {
           ParWork.propagateInterrupt(e);
           throw new SolrException(ErrorCode.SERVER_ERROR,
@@ -645,7 +645,7 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
           + "failed since we're not in cloud mode.");
     }
     try {
-      return zkController.getZkStateReader().getLeaderRetry(collection, cloudDesc.getShardId()).getCoreUrl();
+      return zkController.getZkStateReader().getLeaderRetry(collection, cloudDesc.getShardId(), 1500, false).getCoreUrl();
     } catch (InterruptedException | TimeoutException e) {
       ParWork.propagateInterrupt(e);
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Exception during fetching from leader.", e);
@@ -717,14 +717,14 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
     try {
       // Not equivalent to getLeaderProps, which  retries to find a leader.
       // Replica leader = slice.getLeader();
-      Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, shardId, 100);
+      Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, shardId, 100, false);
       isLeader = leaderReplica.getName().equals(desc.getName());
       if (log.isTraceEnabled()) log.trace("Are we leader for sending to replicas? {} phase={}", isLeader, phase);
       if (!isLeader) {
         isSubShardLeader = amISubShardLeader(coll, slice, id, doc);
         if (isSubShardLeader) {
           shardId = cloudDesc.getShardId();
-          leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, shardId);
+          leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, shardId, 1500, false);
         }
       }
 
@@ -891,7 +891,7 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
     Slice mySlice = coll.getSlice(myShardId);
     final Slice.State state = mySlice.getState();
     if (state == Slice.State.CONSTRUCTION || state == Slice.State.RECOVERY) {
-      Replica myLeader = zkController.getZkStateReader().getLeaderRetry(collection, myShardId);
+      Replica myLeader = zkController.getZkStateReader().getLeaderRetry(collection, myShardId, 1500, false);
       boolean amILeader = myLeader.getName().equals(desc.getName());
       if (amILeader) {
         // Does the document belong to my hash range as well?
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
index a73bb23..48c3329 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
@@ -980,6 +980,13 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
    * Get shard leader properties, with retry if none exist.
    */
   public Replica getLeaderRetry(String collection, String shard, int timeout) throws InterruptedException, TimeoutException {
+    return getLeaderRetry(collection, shard, timeout, true);
+  }
+
+  /**
+   * Get shard leader properties, with retry if none exist.
+   */
+  public Replica getLeaderRetry(String collection, String shard, int timeout, boolean mustBeLive) throws InterruptedException, TimeoutException {
 
     DocCollection coll = getClusterState().getCollectionOrNull(collection);
 
@@ -987,12 +994,20 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
       Slice slice = coll.getSlice(shard);
       if (slice != null) {
         Replica leader = slice.getLeader();
-        if (leader != null && leader.getState() == Replica.State.ACTIVE && isNodeLive(leader.getNodeName())) {
+        boolean valid;
+        try {
+          valid = mustBeLive ? isNodeLive(leader.getNodeName()) || zkClient.exists(COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" + slice.getName() + "/leader") : isNodeLive(leader.getNodeName());
+        } catch (KeeperException e) {
+          throw new SolrException(ErrorCode.SERVER_ERROR, e);
+        } catch (InterruptedException e) {
+          throw new SolrException(ErrorCode.SERVER_ERROR, e);
+        }
+        if (leader != null && leader.getState() == Replica.State.ACTIVE && valid) {
           return leader;
         }
         Collection<Replica> replicas = slice.getReplicas();
         for (Replica replica : replicas) {
-          if ("true".equals(replica.getProperty(LEADER_PROP)) && replica.getState() == Replica.State.ACTIVE && isNodeLive(replica.getNodeName())) {
+          if ("true".equals(replica.getProperty(LEADER_PROP)) && replica.getState() == Replica.State.ACTIVE && valid) {
             return replica;
           }
         }
@@ -1007,15 +1022,36 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
         Slice slice = c.getSlice(shard);
         if (slice == null) return false;
         Replica leader = slice.getLeader();
-        if (leader != null && leader.getState() == Replica.State.ACTIVE && isNodeLive(leader.getNodeName())) {
-          returnLeader.set(leader);
-          return true;
+
+        if (leader != null && leader.getState() == Replica.State.ACTIVE) {
+          boolean valid = false;
+          try {
+            valid = mustBeLive ?  isNodeLive(leader.getNodeName()) || zkClient.exists(COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" + slice.getName()  + "/leader") : isNodeLive(leader.getNodeName());
+          } catch (KeeperException e) {
+            throw new SolrException(ErrorCode.SERVER_ERROR, e);
+          } catch (InterruptedException e) {
+            throw new SolrException(ErrorCode.SERVER_ERROR, e);
+          }
+          if (valid) {
+            returnLeader.set(leader);
+            return true;
+          }
         }
         Collection<Replica> replicas = slice.getReplicas();
         for (Replica replica : replicas) {
-          if ("true".equals(replica.getProperty(LEADER_PROP)) && replica.getState() == Replica.State.ACTIVE && isNodeLive(replica.getNodeName())) {
-            returnLeader.set(replica);
-            return true;
+          if ("true".equals(replica.getProperty(LEADER_PROP)) && replica.getState() == Replica.State.ACTIVE) {
+            boolean valid = false;
+            try {
+              valid = mustBeLive ?  zkClient.exists(COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" + slice.getName()  + "/leader") : isNodeLive(leader.getNodeName());
+            } catch (KeeperException e) {
+              throw new SolrException(ErrorCode.SERVER_ERROR, e);
+            } catch (InterruptedException e) {
+              throw new SolrException(ErrorCode.SERVER_ERROR, e);
+            }
+            if (valid) {
+              returnLeader.set(replica);
+              return true;
+            }
           }
         }
 


[lucene-solr] 01/16: @1277 Don't keep going on bulk update request on a 404.

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 960063aba921e9e534d97b8ffaef648247992621
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Thu Jan 21 19:36:04 2021 -0600

    @1277 Don't keep going on bulk update request on a 404.
---
 .../org/apache/solr/cloud/RecoveryStrategy.java    |  2 +-
 .../java/org/apache/solr/handler/IndexFetcher.java |  2 +-
 .../solr/handler/component/HttpShardHandler.java   |  2 +-
 .../org/apache/solr/update/SolrCmdDistributor.java | 24 ++++++++++++++++++++--
 .../apache/solr/update/MockingHttp2SolrClient.java |  2 +-
 .../client/solrj/impl/CloudHttp2SolrClient.java    |  2 +-
 .../solr/client/solrj/impl/Http2SolrClient.java    | 14 ++++++-------
 .../solr/client/solrj/impl/LBHttp2SolrClient.java  |  8 ++++----
 .../solr/client/solrj/util/AsyncListener.java      |  2 +-
 9 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index 68d99fc..8eb64a9 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -971,7 +971,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
     }
 
     @Override
-    public void onFailure(Throwable throwable) {
+    public void onFailure(Throwable throwable, int code) {
       try {
         latch.countDown();
       } catch (NullPointerException e) {
diff --git a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
index 972d012..1419340 100644
--- a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
+++ b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
@@ -1908,7 +1908,7 @@ public class IndexFetcher {
           }
 
           @Override
-          public void onFailure(Throwable throwable) {
+          public void onFailure(Throwable throwable, int code) {
             log.error("Exception fetching file", throwable);
           }
         });
diff --git a/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java b/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java
index a36744c..31ae407 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java
@@ -201,7 +201,7 @@ public class HttpShardHandler extends ShardHandler {
         responses.add(srsp);
       }
 
-      public void onFailure(Throwable throwable) {
+      public void onFailure(Throwable throwable, int code) {
         ssr.elapsedTime = TimeUnit.MILLISECONDS.convert(System.nanoTime() - startTime, TimeUnit.NANOSECONDS);
         srsp.setException(throwable);
         if (throwable instanceof SolrException) {
diff --git a/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java b/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java
index dc643e8..8ef8913 100644
--- a/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java
+++ b/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java
@@ -66,6 +66,7 @@ public class SolrCmdDistributor implements Closeable {
   private final Http2SolrClient solrClient;
   private volatile boolean closed;
   private final Set<Cancellable> cancels = ConcurrentHashMap.newKeySet(32);
+  private volatile Throwable cancelExeption;
 
   public SolrCmdDistributor(ZkStateReader zkStateReader, UpdateShardHandler updateShardHandler) {
     assert ObjectReleaseTracker.track(this);
@@ -83,6 +84,13 @@ public class SolrCmdDistributor implements Closeable {
 
   public void finish() {
     assert !finished : "lifecycle sanity check";
+
+    if (cancelExeption != null) {
+      Throwable exp = cancelExeption;
+      cancelExeption = null;
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, exp);
+    }
+
     if (isClosed == null || isClosed != null && !isClosed.isClosed()) {
       solrClient.waitForOutstandingRequests();
     } else {
@@ -222,6 +230,12 @@ public class SolrCmdDistributor implements Closeable {
 
   private void submit(final Req req) {
 
+    if (cancelExeption != null) {
+      Throwable exp = cancelExeption;
+      cancelExeption = null;
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, exp);
+    }
+
     if (log.isDebugEnabled()) {
       log.debug("sending update to " + req.node.getUrl() + " retry:" + req.retries + " " + req.cmd + " params:" + req.uReq.getParams());
     }
@@ -266,9 +280,15 @@ public class SolrCmdDistributor implements Closeable {
         }
 
         @Override
-        public void onFailure(Throwable t) {
-          log.error("Exception sending dist update", t);
+        public void onFailure(Throwable t, int code) {
+          log.error("Exception sending dist update {}", code, t);
           cancels.remove(cancelIndex);
+
+          if (code == 404) {
+            cancelExeption = t;
+            return;
+          }
+
           Error error = new Error();
           error.t = t;
           error.req = req;
diff --git a/solr/core/src/test/org/apache/solr/update/MockingHttp2SolrClient.java b/solr/core/src/test/org/apache/solr/update/MockingHttp2SolrClient.java
index 93bbc64..aed9391 100644
--- a/solr/core/src/test/org/apache/solr/update/MockingHttp2SolrClient.java
+++ b/solr/core/src/test/org/apache/solr/update/MockingHttp2SolrClient.java
@@ -146,7 +146,7 @@ public class MockingHttp2SolrClient extends Http2SolrClient {
           e = new SolrServerException(e);
         }
       }
-      asyncListener.onFailure(e);
+      asyncListener.onFailure(e, 500);
     }
 
     return super.asyncRequest(request, collection, asyncListener);
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudHttp2SolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudHttp2SolrClient.java
index d9c8fe9..359a4d8 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudHttp2SolrClient.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudHttp2SolrClient.java
@@ -331,7 +331,7 @@ public class CloudHttp2SolrClient extends BaseCloudSolrClient {
     }
 
     @Override
-    public void onFailure(Throwable t) {
+    public void onFailure(Throwable t, int code) {
       tsExceptions.put(url, t);
       latch.countDown();
     }
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/Http2SolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/Http2SolrClient.java
index 6298773..945fb62 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/Http2SolrClient.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/Http2SolrClient.java
@@ -443,7 +443,7 @@ public class Http2SolrClient extends SolrClient {
         req.request.idleTimeout(idleTimeout, TimeUnit.MILLISECONDS);
       }
     } catch (Exception e) {
-      asyncListener.onFailure(e);
+      asyncListener.onFailure(e, 500);
       return FAILED_MAKING_REQUEST_CANCELLABLE;
     }
     final ResponseParser parser = solrRequest.getResponseParser() == null
@@ -470,7 +470,7 @@ public class Http2SolrClient extends SolrClient {
 
             } catch (Exception e) {
               if (SolrException.getRootCause(e) != CANCELLED_EXCEPTION) {
-                asyncListener.onFailure(e);
+                asyncListener.onFailure(e, 500);
               }
             } finally {
               arrived = true;
@@ -485,7 +485,7 @@ public class Http2SolrClient extends SolrClient {
           super.onFailure(response, failure);
           try {
             if (SolrException.getRootCause(failure) != CANCELLED_EXCEPTION) {
-              asyncListener.onFailure(failure);
+              asyncListener.onFailure(failure, response.getStatus());
             } else {
               asyncListener.onSuccess(new NamedList<>());
             }
@@ -521,7 +521,7 @@ public class Http2SolrClient extends SolrClient {
     } catch (Exception e) {
 
       if (e != CANCELLED_EXCEPTION) {
-        asyncListener.onFailure(e);
+        asyncListener.onFailure(e, 500);
       }
       //log.info("UNREGISTER TRACKER");
      // asyncTracker.arrive();
@@ -540,14 +540,14 @@ public class Http2SolrClient extends SolrClient {
     try {
       req = makeRequest(solrRequest, collection);
     } catch (Exception e) {
-      asyncListener.onFailure(e);
+      asyncListener.onFailure(e, 500);
       return FAILED_MAKING_REQUEST_CANCELLABLE;
     }
     MyInputStreamResponseListener mysl = new MyInputStreamResponseListener(httpClient, asyncListener);
     try {
       req.request.send(mysl);
     } catch (Exception e) {
-      asyncListener.onFailure(e);
+      asyncListener.onFailure(e, 500);
 
       throw new SolrException(SolrException.ErrorCode.UNKNOWN, e);
     }
@@ -1461,7 +1461,7 @@ public class Http2SolrClient extends SolrClient {
     public void onFailure(Response response, Throwable failure) {
       super.onFailure(response, failure);
       try {
-        asyncListener.onFailure(new SolrServerException(failure.getMessage(), failure));
+        asyncListener.onFailure(new SolrServerException(failure.getMessage(), failure), response.getStatus());
       } catch (Exception e) {
         log.error("Exception in async failure listener", e);
       }
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttp2SolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttp2SolrClient.java
index 13a42ef..756f239 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttp2SolrClient.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttp2SolrClient.java
@@ -120,7 +120,7 @@ public class LBHttp2SolrClient extends LBSolrClient {
           try {
             url = it.nextOrError(e);
           } catch (Exception ex) {
-            asyncListener.onFailure(e);
+            asyncListener.onFailure(e, 500);
             return;
           }
           try {
@@ -136,7 +136,7 @@ public class LBHttp2SolrClient extends LBSolrClient {
             MDC.remove("LBSolrClient.url");
           }
         } else {
-          asyncListener.onFailure(e);
+          asyncListener.onFailure(e, 500);
         }
       }
     };
@@ -144,7 +144,7 @@ public class LBHttp2SolrClient extends LBSolrClient {
       Cancellable cancellable = doRequest(it.nextOrError(), req, rsp, isNonRetryable, it.isServingZombieServer(), retryListener);
       currentCancellable.set(cancellable);
     } catch (SolrServerException e) {
-      asyncListener.onFailure(e);
+      asyncListener.onFailure(e, 500);
     }
     return () -> {
 
@@ -177,7 +177,7 @@ public class LBHttp2SolrClient extends LBSolrClient {
       }
 
       @Override
-      public void onFailure(Throwable oe) {
+      public void onFailure(Throwable oe, int code) {
         try {
           throw (Exception) oe;
         } catch (BaseHttpSolrClient.RemoteExecutionException e) {
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/util/AsyncListener.java b/solr/solrj/src/java/org/apache/solr/client/solrj/util/AsyncListener.java
index 1ddd41e..79c07f0 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/util/AsyncListener.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/util/AsyncListener.java
@@ -31,6 +31,6 @@ public interface AsyncListener<T> {
   }
 
   void onSuccess(T t);
-  void onFailure(Throwable throwable);
+  void onFailure(Throwable throwable, int code);
 
 }


[lucene-solr] 08/16: @1284 Allow ACTIVE as well again.

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 57e1627bec9370d3d5dd33bb68532a47660fd82f
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Thu Jan 21 21:52:17 2021 -0600

    @1284 Allow ACTIVE as well again.
---
 solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
index 63418b4..5bd51d8 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
@@ -78,7 +78,7 @@ class PrepRecoveryOp implements CoreAdminHandler.CoreAdminOp {
         final Replica replica = c.getReplica(cname);
 
         if (replica != null) {
-          if (replica.getState() == waitForState && coreContainer.getZkController().getZkStateReader().isNodeLive(replica.getNodeName())) {
+          if ((replica.getState() == waitForState || replica.getState() == Replica.State.ACTIVE) && coreContainer.getZkController().getZkStateReader().isNodeLive(replica.getNodeName())) {
             // if (log.isDebugEnabled()) log.debug("replica={} state={} waitForState={}", replica, replica.getState(), waitForState);
             log.info("replica={} state={} waitForState={}", replica, replica.getState(), waitForState);
             return true;


[lucene-solr] 10/16: @1286 Better error message.

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 2830a37c06864e94d3dedb4bbd20b265b00cb3d5
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Thu Jan 21 22:24:15 2021 -0600

    @1286 Better error message.
---
 .../src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java     | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
index 26771ea..b463137 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
@@ -76,15 +76,16 @@ class PrepRecoveryOp implements CoreAdminHandler.CoreAdminOp {
         // wait until we are sure the recovering node is ready
         // to accept updates
         final Replica replica = c.getReplica(cname);
-
+        boolean isLive = false;
         if (replica != null) {
-          if ((replica.getState() == waitForState || replica.getState() == Replica.State.ACTIVE) && coreContainer.getZkController().getZkStateReader().isNodeLive(replica.getNodeName())) {
+          isLive = coreContainer.getZkController().getZkStateReader().isNodeLive(replica.getNodeName());
+          if ((replica.getState() == waitForState || replica.getState() == Replica.State.ACTIVE) && isLive) {
             // if (log.isDebugEnabled()) log.debug("replica={} state={} waitForState={}", replica, replica.getState(), waitForState);
             log.info("replica={} state={} waitForState={} isLive={}", replica, replica.getState(), waitForState, coreContainer.getZkController().getZkStateReader().isNodeLive(replica.getNodeName()));
             return true;
           }
         }
-
+        errorMessage.set("Timeout waiting to see RECOVERY state replica=" + replica + " state=" + replica.getState() + " waitForState=" + waitForState + " isLive=" + isLive + "\n" + coreContainer.getZkController().getZkStateReader().getClusterState().getCollectionOrNull(collection));
         return false;
       });
 


[lucene-solr] 07/16: @1283 Don't cancel recovery.

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 84f1dff1be821d46bc3234c62c7dfdc1324c9168
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Thu Jan 21 21:50:26 2021 -0600

    @1283 Don't cancel recovery.
---
 solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
index c8f29cd..02ecd79 100644
--- a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
+++ b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
@@ -358,7 +358,7 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover
         if (!locked) {
           recoveryWaiting.incrementAndGet();
           if (log.isDebugEnabled()) log.debug("Wait for recovery lock");
-          cancelRecovery(true, false);
+          //cancelRecovery(true, false);
           while (!(recoveryLock.tryLock() || recoveryLock.tryLock(500, TimeUnit.MILLISECONDS))) {
             if (closed || prepForClose) {
               log.warn("Skipping recovery because we are closed");


[lucene-solr] 05/16: @1281 Clean up some MDC core logging.

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 65b1ef810d45ea017b20b8daf9e6a8f15832c893
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Thu Jan 21 21:25:12 2021 -0600

    @1281 Clean up some MDC core logging.
---
 .../java/org/apache/solr/cloud/LeaderElector.java  | 223 +++++++++++----------
 .../solr/cloud/RecoveringCoreTermWatcher.java      |   6 +-
 .../java/org/apache/solr/cloud/ZkController.java   |  22 +-
 .../java/org/apache/solr/core/CoreContainer.java   |  80 ++++----
 .../solr/handler/admin/CoreAdminOperation.java     |  51 ++---
 .../apache/solr/update/DefaultSolrCoreState.java   |   6 +-
 6 files changed, 206 insertions(+), 182 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
index 1e6dc35..c9b7ffe 100644
--- a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
+++ b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
@@ -25,6 +25,7 @@ import org.apache.solr.common.cloud.SolrZooKeeper;
 import org.apache.solr.common.cloud.ZooKeeperException;
 import org.apache.solr.common.util.IOUtils;
 import org.apache.solr.common.util.ObjectReleaseTracker;
+import org.apache.solr.logging.MDCLoggingContext;
 import org.apache.zookeeper.CreateMode;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.KeeperException.ConnectionLossException;
@@ -119,141 +120,147 @@ public class LeaderElector implements Closeable {
   private synchronized boolean checkIfIamLeader(final ElectionContext context, boolean replacement) throws KeeperException,
           InterruptedException, IOException {
     //if (checkClosed(context)) return false;
-
-    if (log.isDebugEnabled()) log.debug("Check if I am leader {}", context.getClass().getSimpleName());
-    if (isClosed) {
-      log.info("elector is closed, won't join election");
-      return false;
-    }
-
-    executor.submit(() -> {
-      context.checkIfIamLeaderFired();
-    });
-
-    state = CHECK_IF_LEADER;
-    // get all other numbers...
-    final String holdElectionPath = context.electionPath + ELECTION_NODE;
-    List<String> seqs;
-    try {
-      seqs = zkClient.getChildren(holdElectionPath, null, true);
-    } catch (KeeperException.SessionExpiredException e) {
-      log.error("ZooKeeper session has expired");
-      state = OUT_OF_ELECTION;
-      return false;
-    } catch (KeeperException.NoNodeException e) {
-      log.info("the election node disappeared, check if we are the leader again");
-      state = OUT_OF_ELECTION;
-      return false;
-    } catch (KeeperException e) {
-      // we couldn't set our watch for some other reason, retry
-      log.warn("Failed setting election watch, retrying {} {}", e.getClass().getName(), e.getMessage());
-      state = OUT_OF_ELECTION;
-      return true;
-    } catch (Exception e) {
-      // we couldn't set our watch for some other reason, retry
-      log.error("Failed on election getchildren call {} {}", e.getClass().getName(), e.getMessage());
-      state = OUT_OF_ELECTION;
-      return true;
-    }
-
+    MDCLoggingContext.setCoreName(context.leaderProps.getName());
     try {
+      if (log.isDebugEnabled()) log.debug("Check if I am leader {}", context.getClass().getSimpleName());
+      if (isClosed) {
+        log.info("elector is closed, won't join election");
+        return false;
+      }
 
-      sortSeqs(seqs);
+      executor.submit(() -> {
+        context.checkIfIamLeaderFired();
+      });
 
-      String leaderSeqNodeName;
+      state = CHECK_IF_LEADER;
+      // get all other numbers...
+      final String holdElectionPath = context.electionPath + ELECTION_NODE;
+      List<String> seqs;
       try {
-        leaderSeqNodeName = context.leaderSeqPath.substring(context.leaderSeqPath.lastIndexOf('/') + 1);
-      } catch (NullPointerException e) {
+        seqs = zkClient.getChildren(holdElectionPath, null, true);
+      } catch (KeeperException.SessionExpiredException e) {
+        log.error("ZooKeeper session has expired");
         state = OUT_OF_ELECTION;
-        if (log.isDebugEnabled()) log.debug("leaderSeqPath has been removed, bailing");
         return true;
-      }
-      if (!seqs.contains(leaderSeqNodeName)) {
-        log.warn("Our node is no longer in line to be leader");
+      } catch (KeeperException.NoNodeException e) {
+        log.info("the election node disappeared");
         state = OUT_OF_ELECTION;
         return false;
+      } catch (KeeperException e) {
+        // we couldn't set our watch for some other reason, retry
+        log.warn("Failed setting election watch, retrying {} {}", e.getClass().getName(), e.getMessage());
+        state = OUT_OF_ELECTION;
+        return true;
+      } catch (Exception e) {
+        // we couldn't set our watch for some other reason, retry
+        log.error("Failed on election getchildren call {} {}", e.getClass().getName(), e.getMessage());
+        state = OUT_OF_ELECTION;
+        return true;
       }
-      if (log.isDebugEnabled()) log.debug("The leader election node is {}", leaderSeqNodeName);
-      if (leaderSeqNodeName.equals(seqs.get(0))) {
-        // I am the leader
-        if (log.isDebugEnabled()) log.debug("I am the potential leader {}, running leader process", context.leaderProps.getName());
-        ElectionWatcher oldWatcher = watcher;
-        if (oldWatcher != null) {
-          oldWatcher.close();
-        }
-
-        if ((zkController != null && zkController.getCoreContainer().isShutDown())) {
-          if (log.isDebugEnabled()) log.debug("Elector is closed, will not try and run leader processes");
-          state = OUT_OF_ELECTION;
-          return false;
-        }
 
-        state = POT_LEADER;
-        runIamLeaderProcess(context, replacement);
-        return false;
+      try {
 
-      } else {
+        sortSeqs(seqs);
 
-        String toWatch = seqs.get(0);
-        for (String node : seqs) {
-          if (leaderSeqNodeName.equals(node)) {
-            break;
-          }
-          toWatch = node;
-        }
+        String leaderSeqNodeName;
         try {
-          String watchedNode = holdElectionPath + "/" + toWatch;
-
-          log.info("I am not the leader (our path is ={}) - watch the node below me {} seqs={}", leaderSeqNodeName, watchedNode, seqs);
-
+          leaderSeqNodeName = context.leaderSeqPath.substring(context.leaderSeqPath.lastIndexOf('/') + 1);
+        } catch (NullPointerException e) {
+          state = OUT_OF_ELECTION;
+          if (log.isDebugEnabled()) log.debug("leaderSeqPath has been removed, bailing");
+          return true;
+        }
+        if (!seqs.contains(leaderSeqNodeName)) {
+          log.warn("Our node is no longer in line to be leader");
+          state = OUT_OF_ELECTION;
+          return false;
+        }
+        if (log.isDebugEnabled()) log.debug("The leader election node is {}", leaderSeqNodeName);
+        if (leaderSeqNodeName.equals(seqs.get(0))) {
+          // I am the leader
+          if (log.isDebugEnabled()) log.debug("I am the potential leader {}, running leader process", context.leaderProps.getName());
           ElectionWatcher oldWatcher = watcher;
           if (oldWatcher != null) {
-            IOUtils.closeQuietly(oldWatcher);
+            oldWatcher.close();
           }
 
-          watcher = new ElectionWatcher(context.leaderSeqPath, watchedNode, context);
-          Stat exists = zkClient.exists(watchedNode, watcher);
-          if (exists == null) {
+          if ((zkController != null && zkController.getCoreContainer().isShutDown())) {
+            if (log.isDebugEnabled()) log.debug("Elector is closed, will not try and run leader processes");
             state = OUT_OF_ELECTION;
-            return true;
+            return false;
           }
 
-          state = WAITING_IN_ELECTION;
-          if (log.isDebugEnabled()) log.debug("Watching path {} to know if I could be the leader, my node is {}", watchedNode, context.leaderSeqPath);
-
+          state = POT_LEADER;
+          runIamLeaderProcess(context, replacement);
           return false;
-        } catch (KeeperException.SessionExpiredException e) {
-          state = OUT_OF_ELECTION;
-          log.error("ZooKeeper session has expired");
-          throw e;
-        } catch (KeeperException.NoNodeException e) {
-          log.info("the previous node disappeared, check if we are the leader again");
 
-        } catch (KeeperException e) {
-          // we couldn't set our watch for some other reason, retry
-          log.warn("Failed setting election watch, retrying {} {}", e.getClass().getName(), e.getMessage());
+        } else {
+
+          String toWatch = seqs.get(0);
+          for (String node : seqs) {
+            if (leaderSeqNodeName.equals(node)) {
+              break;
+            }
+            toWatch = node;
+          }
+          try {
+            String watchedNode = holdElectionPath + "/" + toWatch;
 
-        } catch (Exception e) {
-          state = OUT_OF_ELECTION;
-          // we couldn't set our watch for some other reason, retry
-          log.error("Failed setting election watch {} {}", e.getClass().getName(), e.getMessage());
+            log.info("I am not the leader (our path is ={}) - watch the node below me {} seqs={}", leaderSeqNodeName, watchedNode, seqs);
+
+            ElectionWatcher oldWatcher = watcher;
+            if (oldWatcher != null) {
+              IOUtils.closeQuietly(oldWatcher);
+            }
+
+            watcher = new ElectionWatcher(context.leaderSeqPath, watchedNode, context);
+            Stat exists = zkClient.exists(watchedNode, watcher);
+            if (exists == null) {
+              state = OUT_OF_ELECTION;
+              return true;
+            }
+
+            state = WAITING_IN_ELECTION;
+            if (log.isDebugEnabled()) log.debug("Watching path {} to know if I could be the leader, my node is {}", watchedNode, context.leaderSeqPath);
+
+            return false;
+          } catch (KeeperException.SessionExpiredException e) {
+            state = OUT_OF_ELECTION;
+            log.error("ZooKeeper session has expired");
+            return true;
+          } catch (KeeperException.NoNodeException e) {
+            log.info("the previous node disappeared, check if we are the leader again");
+            state = OUT_OF_ELECTION;
+            return true;
+          } catch (KeeperException e) {
+            // we couldn't set our watch for some other reason, retry
+            log.warn("Failed setting election watch, retrying {} {}", e.getClass().getName(), e.getMessage());
+            state = OUT_OF_ELECTION;
+            return true;
+          } catch (Exception e) {
+            state = OUT_OF_ELECTION;
+            // we couldn't set our watch for some other reason, retry
+            log.error("Failed setting election watch {} {}", e.getClass().getName(), e.getMessage());
+            state = OUT_OF_ELECTION;
+            return true;
+          }
         }
+
+      } catch (KeeperException.SessionExpiredException e) {
+        log.error("ZooKeeper session has expired");
+        state = OUT_OF_ELECTION;
+        return true;
+      } catch (AlreadyClosedException e) {
+        state = OUT_OF_ELECTION;
+        return true;
+      } catch (Exception e) {
+        state = OUT_OF_ELECTION;
+        return true;
       }
 
-    } catch (KeeperException.SessionExpiredException e) {
-      log.error("ZooKeeper session has expired");
-      state = OUT_OF_ELECTION;
-      return false;
-    } catch (AlreadyClosedException e) {
-      state = OUT_OF_ELECTION;
-      return false;
-    } catch (Exception e) {
-      state = OUT_OF_ELECTION;
-      return true;
+    } finally {
+      MDCLoggingContext.clear();
     }
-
-    state = OUT_OF_ELECTION;
-    return true;
   }
 
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveringCoreTermWatcher.java b/solr/core/src/java/org/apache/solr/cloud/RecoveringCoreTermWatcher.java
index 5416b89..3a4d371 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveringCoreTermWatcher.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveringCoreTermWatcher.java
@@ -22,6 +22,7 @@ import org.apache.solr.common.ParWork;
 import org.apache.solr.core.CoreContainer;
 import org.apache.solr.core.CoreDescriptor;
 import org.apache.solr.core.SolrCore;
+import org.apache.solr.logging.MDCLoggingContext;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -50,8 +51,9 @@ public class RecoveringCoreTermWatcher implements ZkShardTerms.CoreTermWatcher,
   @Override
   public boolean onTermChanged(ShardTerms terms) {
     if (coreContainer.isShutDown()) return false;
+    MDCLoggingContext.setCoreDescriptor(coreContainer, coreDescriptor);
 
-     try {
+    try {
       if (closed) {
         return false;
       }
@@ -74,6 +76,8 @@ public class RecoveringCoreTermWatcher implements ZkShardTerms.CoreTermWatcher,
         log.info("Failed to watch term of core {}", coreDescriptor.getName(), e);
       }
       return false;
+    } finally {
+      MDCLoggingContext.clear();
     }
 
     return true;
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 1f4ecad..a5a6b19 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -1722,17 +1722,17 @@ public class ZkController implements Closeable, Runnable {
    */
   public void publish(final CoreDescriptor cd, final Replica.State state, boolean updateLastState) throws Exception {
     MDCLoggingContext.setCoreDescriptor(cc, cd);
-    log.info("publishing state={}", state);
-    try (SolrCore core = cc.getCore(cd.getName())) {
-      if ((state == Replica.State.ACTIVE || state == Replica.State.RECOVERING) && (isClosed() || (core != null && core.isClosing()))) {
-        log.info("already closed, won't publish state={}", state);
-        throw new AlreadyClosedException();
+    try {
+      log.info("publishing state={}", state);
+      try (SolrCore core = cc.getCore(cd.getName())) {
+        if ((state == Replica.State.ACTIVE || state == Replica.State.RECOVERING) && (isClosed() || (core != null && core.isClosing()))) {
+          log.info("already closed, won't publish state={}", state);
+          throw new AlreadyClosedException();
+        }
       }
-    }
 
-    // nocommit TODO if we publish anything but ACTIVE, cancel any possible election?
+      // nocommit TODO if we publish anything but ACTIVE, cancel any possible election?
 
-    try {
       String collection = cd.getCloudDescriptor().getCollectionName();
 
       // System.out.println(Thread.currentThread().getStackTrace()[3]);
@@ -1747,10 +1747,10 @@ public class ZkController implements Closeable, Runnable {
       Map<String,Object> props = new HashMap<>();
       props.put(Overseer.QUEUE_OPERATION, "state");
       props.put(ZkStateReader.STATE_PROP, state.toString());
-    //  props.put(ZkStateReader.ROLES_PROP, cd.getCloudDescriptor().getRoles());
+      //  props.put(ZkStateReader.ROLES_PROP, cd.getCloudDescriptor().getRoles());
       props.put(CORE_NAME_PROP, cd.getName());
-    //  props.put(ZkStateReader.NODE_NAME_PROP, getNodeName());
-    //  props.put(ZkStateReader.SHARD_ID_PROP, cd.getCloudDescriptor().getShardId());
+      //  props.put(ZkStateReader.NODE_NAME_PROP, getNodeName());
+      //  props.put(ZkStateReader.SHARD_ID_PROP, cd.getCloudDescriptor().getShardId());
       props.put(ZkStateReader.COLLECTION_PROP, collection);
       props.put(ZkStateReader.REPLICA_TYPE, cd.getCloudDescriptor().getReplicaType().toString());
 
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index 7a0c006..54248c2 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -104,6 +104,7 @@ import org.apache.solr.util.SystemIdResolver;
 import org.apache.zookeeper.KeeperException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.slf4j.MDC;
 
 import static java.util.Objects.requireNonNull;
 import static org.apache.solr.common.params.CommonParams.AUTHC_PATH;
@@ -920,21 +921,25 @@ public class CoreContainer implements Closeable {
 
           coreLoadFutures.add(solrCoreLoadExecutor.submit(() -> {
             SolrCore core;
+            MDCLoggingContext.setCoreDescriptor(this, cd);
             try {
+              try {
 
-              core = createFromDescriptor(cd, false);
+                core = createFromDescriptor(cd, false);
 
-              if (core.getDirectoryFactory().isSharedStorage()) {
-                if (isZooKeeperAware()) {
-                  zkSys.getZkController().throwErrorIfReplicaReplaced(cd);
+                if (core.getDirectoryFactory().isSharedStorage()) {
+                  if (isZooKeeperAware()) {
+                    zkSys.getZkController().throwErrorIfReplicaReplaced(cd);
+                  }
                 }
-              }
 
+              } finally {
+                solrCores.markCoreAsNotLoading(cd);
+              } if (isZooKeeperAware()) {
+                new ZkController.RegisterCoreAsync(zkSys.zkController, cd, false).call();
+              }
             } finally {
-              solrCores.markCoreAsNotLoading(cd);
-            }
-            if (isZooKeeperAware()) {
-              new ZkController.RegisterCoreAsync(zkSys.zkController, cd, false).call();
+              MDCLoggingContext.clear();
             }
             return core;
           }));
@@ -1406,8 +1411,8 @@ public class CoreContainer implements Closeable {
       ConfigSet coreConfig = coreConfigService.loadConfigSet(dcore);
       dcore.setConfigSetTrusted(coreConfig.isTrusted());
       if (log.isInfoEnabled()) {
-        log.info("Creating SolrCore '{}' using configuration from {} solrconfig={}, trusted={}", dcore.getName(),
-            coreConfig.getName(), coreConfig.getSolrConfig().getName(), dcore.isConfigSetTrusted());
+        log.info("Creating SolrCore '{}' using configuration from {} solrconfig={}, trusted={}", dcore.getName(), coreConfig.getName(), coreConfig.getSolrConfig().getName(),
+            dcore.isConfigSetTrusted());
       }
 
       try {
@@ -1429,14 +1434,13 @@ public class CoreContainer implements Closeable {
 
         old = registerCore(dcore, core, true);
         registered = true;
-      } catch (Exception e){
+      } catch (Exception e) {
 
         throw new SolrException(ErrorCode.SERVER_ERROR, e);
       } finally {
         solrCores.markCoreAsNotLoading(dcore);
       }
 
-
       // always kick off recovery if we are in non-Cloud mode
       if (!isZooKeeperAware() && core.getUpdateHandler().getUpdateLog() != null) {
         core.getUpdateHandler().getUpdateLog().recoverFromLog();
@@ -1452,9 +1456,9 @@ public class CoreContainer implements Closeable {
         unload(dcore.getName(), true, true, true);
         throw e;
       }
-//      if (!registered) {
-//        solrCores.removeCoreDescriptor(dcore);
-//      }
+      //      if (!registered) {
+      //        solrCores.removeCoreDescriptor(dcore);
+      //      }
       final SolrException solrException = new SolrException(ErrorCode.SERVER_ERROR, "Unable to create core [" + dcore.getName() + "]", e);
       throw solrException;
     } catch (Throwable t) {
@@ -1465,33 +1469,35 @@ public class CoreContainer implements Closeable {
 
       throw t;
     } finally {
-
-      if (core != null) {
-        if (!registered) {
-          if (core != null) {
-
+      try {
+        if (core != null) {
+          if (!registered) {
+            if (core != null) {
+
+              SolrCore finalCore1 = core;
+              solrCoreCloseExecutor.submit(() -> {
+                finalCore1.closeAndWait();
+              });
+              SolrCore finalOld = old;
+              solrCoreCloseExecutor.submit(() -> {
+                if (finalOld != null) {
+                  finalOld.closeAndWait();
+                }
+              });
+            }
+          }
+          if (isShutDown) {
             SolrCore finalCore1 = core;
-            solrCoreCloseExecutor.submit(() -> {
+            ParWork.getRootSharedExecutor().submit(() -> {
+
               finalCore1.closeAndWait();
-            });
-            SolrCore finalOld = old;
-            solrCoreCloseExecutor.submit(() -> {
-              if (finalOld != null) {
-                finalOld.closeAndWait();
-              }
+
             });
           }
         }
-        if (isShutDown) {
-          SolrCore finalCore1 = core;
-          ParWork.getRootSharedExecutor().submit(() -> {
-
-            finalCore1.closeAndWait();
-
-          });
-        }
+      } finally {
+        MDCLoggingContext.clear();
       }
-      MDCLoggingContext.clear();
     }
   }
 
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminOperation.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminOperation.java
index 76fab2e..935ee4a 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminOperation.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminOperation.java
@@ -43,6 +43,7 @@ import org.apache.solr.core.snapshots.SolrSnapshotManager;
 import org.apache.solr.core.snapshots.SolrSnapshotMetaDataManager;
 import org.apache.solr.core.snapshots.SolrSnapshotMetaDataManager.SnapshotMetaData;
 import org.apache.solr.handler.admin.CoreAdminHandler.CoreAdminOp;
+import org.apache.solr.logging.MDCLoggingContext;
 import org.apache.solr.metrics.SolrMetricManager;
 import org.apache.solr.search.SolrIndexSearcher;
 import org.apache.solr.update.UpdateLog;
@@ -72,35 +73,39 @@ import static org.apache.solr.handler.admin.CoreAdminHandler.normalizePath;
 enum CoreAdminOperation implements CoreAdminOp {
 
   CREATE_OP(CREATE, it -> {
-    assert TestInjection.injectRandomDelayInCoreCreation();
-
     SolrParams params = it.req.getParams();
     log().info("core create command {}", params);
     String coreName = params.required().get(CoreAdminParams.NAME);
-    Map<String, String> coreParams = buildCoreParams(params);
-    CoreContainer coreContainer = it.handler.coreContainer;
-    Path instancePath;
-
-    // TODO: Should we nuke setting odd instance paths?  They break core discovery, generally
-    String instanceDir = it.req.getParams().get(CoreAdminParams.INSTANCE_DIR);
-    if (instanceDir == null)
-      instanceDir = it.req.getParams().get("property.instanceDir");
-    if (instanceDir != null) {
-      instanceDir = PropertiesUtil.substituteProperty(instanceDir, coreContainer.getContainerProperties());
-      instancePath = coreContainer.getCoreRootDirectory().resolve(instanceDir).normalize();
-    } else {
-      instancePath = coreContainer.getCoreRootDirectory().resolve(coreName);
-    }
+    MDCLoggingContext.setCoreName(coreName);
+    try {
+      assert TestInjection.injectRandomDelayInCoreCreation();
+
+      Map<String,String> coreParams = buildCoreParams(params);
+      CoreContainer coreContainer = it.handler.coreContainer;
+      Path instancePath;
+
+      // TODO: Should we nuke setting odd instance paths?  They break core discovery, generally
+      String instanceDir = it.req.getParams().get(CoreAdminParams.INSTANCE_DIR);
+      if (instanceDir == null) instanceDir = it.req.getParams().get("property.instanceDir");
+      if (instanceDir != null) {
+        instanceDir = PropertiesUtil.substituteProperty(instanceDir, coreContainer.getContainerProperties());
+        instancePath = coreContainer.getCoreRootDirectory().resolve(instanceDir).normalize();
+      } else {
+        instancePath = coreContainer.getCoreRootDirectory().resolve(coreName);
+      }
 
-    boolean newCollection = params.getBool(CoreAdminParams.NEW_COLLECTION, false);
-    if (coreContainer.isShutDown()) {
-      log().warn("Will not create SolrCore, CoreContainer is shutdown");
-      throw new AlreadyClosedException("Will not create SolrCore, CoreContainer is shutdown");
-    }
+      boolean newCollection = params.getBool(CoreAdminParams.NEW_COLLECTION, false);
+      if (coreContainer.isShutDown()) {
+        log().warn("Will not create SolrCore, CoreContainer is shutdown");
+        throw new AlreadyClosedException("Will not create SolrCore, CoreContainer is shutdown");
+      }
 
-    coreContainer.create(coreName, instancePath, coreParams, newCollection);
+      coreContainer.create(coreName, instancePath, coreParams, newCollection);
 
-    it.rsp.add("core", coreName);
+      it.rsp.add("core", coreName);
+    } finally {
+      MDCLoggingContext.clear();
+    }
   }),
   UNLOAD_OP(UNLOAD, it -> {
     SolrParams params = it.req.getParams();
diff --git a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
index 7c1f4f5..c8f29cd 100644
--- a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
+++ b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
@@ -320,13 +320,14 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover
     CoreContainer corecontainer = core.getCoreContainer();
 
     Runnable recoveryTask = () -> {
+      CoreDescriptor coreDescriptor = core.getCoreDescriptor();
+      MDCLoggingContext.setCoreDescriptor(corecontainer, coreDescriptor);
       try {
         if (SKIP_AUTO_RECOVERY) {
           log.warn("Skipping recovery according to sys prop solrcloud.skip.autorecovery");
           return;
         }
-        CoreDescriptor coreDescriptor = core.getCoreDescriptor();
-        MDCLoggingContext.setCoreDescriptor(corecontainer, coreDescriptor);
+
         if (log.isDebugEnabled()) log.debug("Going to create and run RecoveryStrategy");
 
 //        try {
@@ -420,6 +421,7 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover
       if (!success) {
         recoverying = false;
       }
+
     }
   }
 


[lucene-solr] 06/16: @1282 Fix long to int for real.

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 5351ab312fbaa09596f525d27f4f7c84448614cd
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Thu Jan 21 21:28:08 2021 -0600

    @1282 Fix long to int for real.
---
 solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java b/solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java
index c8f120e..f6b6f13 100644
--- a/solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java
+++ b/solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java
@@ -87,7 +87,7 @@ public class SolrQoSFilter extends QoSFilter {
             if (log.isDebugEnabled()) log.debug("set max concurrent requests to orig value {}", _origMaxRequests);
             updateMaxRequests(_origMaxRequests, sLoad, ourLoad);
           } else {
-            updateMaxRequests(Math.min(_origMaxRequests, (int) Math.round(getMaxRequests() * 3)), sLoad, ourLoad);
+            updateMaxRequests(Math.min(_origMaxRequests, getMaxRequests() * 3), sLoad, ourLoad);
           }
         } else {
           if (ourLoad > 0.90 && sLoad > 1.5) {
@@ -102,7 +102,7 @@ public class SolrQoSFilter extends QoSFilter {
               if (log.isDebugEnabled()) log.debug("set max concurrent requests to orig value {}", _origMaxRequests);
               updateMaxRequests(_origMaxRequests, sLoad, ourLoad);
             } else {
-              updateMaxRequests(Math.min(_origMaxRequests, (int) Math.round(getMaxRequests() * 3)), sLoad, ourLoad);
+              updateMaxRequests(Math.min(_origMaxRequests, getMaxRequests() * 3), sLoad, ourLoad);
             }
 
           }


[lucene-solr] 11/16: @1287 Tweak prep recovery cmd.

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 94dbe77314fb48c6dcb4013a1b0a3fbd014ca16d
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Thu Jan 21 22:33:38 2021 -0600

    @1287 Tweak prep recovery cmd.
---
 solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index 8eb64a9..ff6b669 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -923,19 +923,19 @@ public class RecoveryStrategy implements Runnable, Closeable {
     log.info("Sending prep recovery command to {} for core {} params={}", leaderBaseUrl, leaderCoreName, prepCmd.getParams());
 
     int conflictWaitMs = zkController.getLeaderConflictResolveWait();
-    int readTimeout = conflictWaitMs + Integer.parseInt(System.getProperty("prepRecoveryReadTimeoutExtraWait", "5000"));
+    int readTimeout = conflictWaitMs + Integer.parseInt(System.getProperty("prepRecoveryReadTimeoutExtraWait", "10000"));
     // nocommit
     try (Http2SolrClient client = new Http2SolrClient.Builder(leaderBaseUrl).withHttpClient(cc.getUpdateShardHandler().
         getRecoveryOnlyClient()).idleTimeout(readTimeout).markInternalRequest().build()) {
 
       prepCmd.setBasePath(leaderBaseUrl);
-      log.info("Sending prep recovery command to [{}]; [{}]", leaderBaseUrl, prepCmd);
+
       latch = new CountDownLatch(1);
       Cancellable result = client.asyncRequest(prepCmd, null, new NamedListAsyncListener(latch));
       try {
         prevSendPreRecoveryHttpUriRequest = result;
         try {
-          boolean success = latch.await(5, TimeUnit.SECONDS);
+          boolean success = latch.await(15, TimeUnit.SECONDS);
           if (!success) {
             result.cancel();
             throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Timeout waiting for prep recovery cmd on leader");


[lucene-solr] 03/16: @1279 Core replacing only if using shared storage.

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 7a853752a08d6eb50e79084cacc520026b5aa8e2
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Thu Jan 21 19:52:45 2021 -0600

    @1279 Core replacing only if using shared storage.
---
 solr/core/src/java/org/apache/solr/core/CoreContainer.java | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index f7be9a1..7a0c006 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -921,11 +921,15 @@ public class CoreContainer implements Closeable {
           coreLoadFutures.add(solrCoreLoadExecutor.submit(() -> {
             SolrCore core;
             try {
-              if (isZooKeeperAware()) {
-                zkSys.getZkController().throwErrorIfReplicaReplaced(cd);
-              }
+
               core = createFromDescriptor(cd, false);
 
+              if (core.getDirectoryFactory().isSharedStorage()) {
+                if (isZooKeeperAware()) {
+                  zkSys.getZkController().throwErrorIfReplicaReplaced(cd);
+                }
+              }
+
             } finally {
               solrCores.markCoreAsNotLoading(cd);
             }


[lucene-solr] 09/16: @1285 Cleanup

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 1f2b383dac2bf4ff046ceab79c5d5703066dde28
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Thu Jan 21 22:15:03 2021 -0600

    @1285 Cleanup
---
 .../solr/cloud/ShardLeaderElectionContext.java     | 22 +++++++++++++---------
 .../solr/handler/admin/CoreAdminHandler.java       |  1 +
 .../apache/solr/handler/admin/PrepRecoveryOp.java  |  2 +-
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
index feabe53..93cfb38 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
@@ -23,7 +23,6 @@ import java.util.concurrent.TimeUnit;
 
 import org.apache.solr.cloud.overseer.OverseerAction;
 import org.apache.solr.common.AlreadyClosedException;
-import org.apache.solr.common.ParWork;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.cloud.ClusterState;
@@ -102,10 +101,12 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
     try (SolrCore core = cc.getCore(coreName)) {
       if (core == null) {
         log.error("No SolrCore found, cannot become leader {}", coreName);
-        throw new SolrException(ErrorCode.SERVER_ERROR, "No SolrCore found, cannot become leader " + coreName);
+        throw new AlreadyClosedException("No SolrCore found, cannot become leader " + coreName);
       }
       if (core.isClosing() || core.getCoreContainer().isShutDown()) {
         log.info("We are closed, will not become leader");
+        closed = true;
+        cancelElection();
         return false;
       }
       try {
@@ -255,15 +256,15 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
 
         zkController.publish(zkNodes);
 
-      } catch (AlreadyClosedException | InterruptedException e) {
-        ParWork.propagateInterrupt("Already closed or interrupted, bailing..", e);
-        throw new SolrException(ErrorCode.SERVER_ERROR, e);
-      } catch (SessionExpiredException e) {
-        SolrException.log(log, "SessionExpired", e);
+      } catch (AlreadyClosedException e) {
+        log.info("Already closed or interrupted, bailing..", e);
         throw e;
+      } catch (InterruptedException e) {
+        log.warn("Already closed or interrupted, bailing..", e);
+        throw new SolrException(ErrorCode.SERVER_ERROR, e);
       } catch (Exception e) {
         SolrException.log(log, "There was a problem trying to register as the leader", e);
-        // we could not publish ourselves as leader - try and rejoin election
+        // we could not register ourselves as leader - try and rejoin election
 
         rejoinLeaderElection(core);
         return false;
@@ -271,7 +272,10 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
 
 
     } catch (AlreadyClosedException e) {
-      log.info("CoreContainer is shutting down, won't become leader");
+      log.info("Already closed, won't become leader");
+      closed = true;
+      cancelElection();
+      throw e;
     } finally {
       MDCLoggingContext.clear();
     }
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
index 24c576a..c2fe837 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
@@ -208,6 +208,7 @@ public class CoreAdminHandler extends RequestHandlerBase implements PermissionNa
         } finally {
           MDC.remove("CoreAdminHandler.asyncId");
           MDC.remove("CoreAdminHandler.action");
+          MDCLoggingContext.clear();
         }
       }
     } finally {
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
index 5bd51d8..26771ea 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
@@ -80,7 +80,7 @@ class PrepRecoveryOp implements CoreAdminHandler.CoreAdminOp {
         if (replica != null) {
           if ((replica.getState() == waitForState || replica.getState() == Replica.State.ACTIVE) && coreContainer.getZkController().getZkStateReader().isNodeLive(replica.getNodeName())) {
             // if (log.isDebugEnabled()) log.debug("replica={} state={} waitForState={}", replica, replica.getState(), waitForState);
-            log.info("replica={} state={} waitForState={}", replica, replica.getState(), waitForState);
+            log.info("replica={} state={} waitForState={} isLive={}", replica, replica.getState(), waitForState, coreContainer.getZkController().getZkStateReader().isNodeLive(replica.getNodeName()));
             return true;
           }
         }


[lucene-solr] 14/16: wip

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit f1e301577cade8beeb2f16315c817a7ba06c67fd
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Fri Jan 22 20:59:55 2021 -0600

    wip
---
 .../java/org/apache/solr/cloud/LeaderElector.java  |  23 ++---
 .../org/apache/solr/cloud/RecoveryStrategy.java    |  34 +++----
 .../solr/cloud/ShardLeaderElectionContext.java     |  35 ++++---
 .../java/org/apache/solr/cloud/ZkController.java   |  26 ++---
 .../java/org/apache/solr/core/CoreContainer.java   |  91 +++++++++---------
 .../java/org/apache/solr/handler/IndexFetcher.java |  11 +--
 .../handler/component/RealTimeGetComponent.java    |   1 +
 .../org/apache/solr/update/AddUpdateCommand.java   |  10 +-
 .../src/java/org/apache/solr/update/PeerSync.java  |  30 +++++-
 .../org/apache/solr/update/PeerSyncWithLeader.java |  13 ++-
 .../src/java/org/apache/solr/update/UpdateLog.java | 107 ++++++++++-----------
 .../org/apache/solr/update/UpdateShardHandler.java |   2 +-
 .../CollectionsAPIDistributedZkTest.java           |   1 +
 .../org/apache/solr/common/SolrInputDocument.java  |   3 +-
 .../apache/solr/common/cloud/SolrZooKeeper.java    |  32 +++---
 .../apache/solr/common/cloud/ZkStateReader.java    |  58 +++++------
 16 files changed, 248 insertions(+), 229 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
index c9b7ffe..ac2cbe9 100644
--- a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
+++ b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
@@ -184,11 +184,11 @@ public class LeaderElector implements Closeable {
             oldWatcher.close();
           }
 
-          if ((zkController != null && zkController.getCoreContainer().isShutDown())) {
-            if (log.isDebugEnabled()) log.debug("Elector is closed, will not try and run leader processes");
-            state = OUT_OF_ELECTION;
-            return false;
-          }
+//          if ((zkController != null && zkController.getCoreContainer().isShutDown())) {
+//            if (log.isDebugEnabled()) log.debug("Elector is closed, will not try and run leader processes");
+//            state = OUT_OF_ELECTION;
+//            return false;
+//          }
 
           state = POT_LEADER;
           runIamLeaderProcess(context, replacement);
@@ -267,12 +267,12 @@ public class LeaderElector implements Closeable {
   // TODO: get this core param out of here
   protected void runIamLeaderProcess(final ElectionContext context, boolean weAreReplacement) throws KeeperException,
           InterruptedException, IOException {
-    if (state == CLOSED) {
-      throw new AlreadyClosedException();
-    }
-    if (state == LEADER) {
-      throw new IllegalStateException("Already in leader state");
-    }
+//    if (state == CLOSED) {
+//      throw new AlreadyClosedException();
+//    }
+//    if (state == LEADER) {
+//      throw new IllegalStateException("Already in leader state");
+//    }
 
     boolean success = context.runLeaderProcess(context, weAreReplacement, 0);
 
@@ -280,6 +280,7 @@ public class LeaderElector implements Closeable {
       state = LEADER;
     } else {
       state = OUT_OF_ELECTION;
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Failed becoming leader");
     }
   }
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index eb377d6..ee085b2 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -56,7 +56,6 @@ import org.apache.solr.util.plugin.NamedListInitializedPlugin;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import static org.apache.solr.common.cloud.ZkStateReader.COLLECTIONS_ZKNODE;
 import java.io.Closeable;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
@@ -230,10 +229,11 @@ public class RecoveryStrategy implements Runnable, Closeable {
 
     log.info("Attempting to replicate from [{}].", leaderprops);
 
-    final String leaderUrl = getReplicateLeaderUrl(leaderprops, zkStateReader);
-
+    String leaderUrl;
     // send commit
     try {
+      Replica leader = zkController.getZkStateReader().getLeaderRetry(coreDescriptor.getCollectionName(), coreDescriptor.getCloudDescriptor().getShardId(), 1500, false);
+      leaderUrl = leader.getCoreUrl();
       commitOnLeader(leaderUrl);
     } catch (Exception e) {
       log.error("Commit on leader failed", e);
@@ -610,20 +610,6 @@ public class RecoveryStrategy implements Runnable, Closeable {
           return false;
         }
 
-        log.info("Begin buffering updates. core=[{}]", coreName);
-        // recalling buffer updates will drop the old buffer tlog
-        ulog.bufferUpdates();
-
-//        try {
-//          if (prevSendPreRecoveryHttpUriRequest != null) {
-//            prevSendPreRecoveryHttpUriRequest.cancel();
-//          }
-//        } catch (NullPointerException e) {
-//          // okay
-//        }
-       // TODO can we do this with commit on leader
-        sendPrepRecoveryCmd(leader.getBaseUrl(), leader.getName(), zkStateReader.getClusterState().getCollection(coreDescriptor.getCollectionName()).getSlice(cloudDesc.getShardId()));
-
         // we wait a bit so that any updates on the leader
         // that started before they saw recovering state
         // are sure to have finished (see SOLR-7141 for
@@ -684,6 +670,20 @@ public class RecoveryStrategy implements Runnable, Closeable {
 
           try {
 
+            log.info("Begin buffering updates. core=[{}]", coreName);
+            // recalling buffer updates will drop the old buffer tlog
+            ulog.bufferUpdates();
+
+            //        try {
+            //          if (prevSendPreRecoveryHttpUriRequest != null) {
+            //            prevSendPreRecoveryHttpUriRequest.cancel();
+            //          }
+            //        } catch (NullPointerException e) {
+            //          // okay
+            //        }
+            // TODO can we do this with commit on leader
+            sendPrepRecoveryCmd(leader.getBaseUrl(), leader.getName(), zkStateReader.getClusterState().getCollection(coreDescriptor.getCollectionName()).getSlice(cloudDesc.getShardId()));
+
             IndexFetcher.IndexFetchResult result = replicate(leader);
 
             if (result.getSuccessful()) {
diff --git a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
index 93cfb38..f74f5dc 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
@@ -38,7 +38,6 @@ import org.apache.solr.logging.MDCLoggingContext;
 import org.apache.solr.update.PeerSync;
 import org.apache.solr.update.UpdateLog;
 import org.apache.zookeeper.KeeperException;
-import org.apache.zookeeper.KeeperException.SessionExpiredException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -103,19 +102,19 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
         log.error("No SolrCore found, cannot become leader {}", coreName);
         throw new AlreadyClosedException("No SolrCore found, cannot become leader " + coreName);
       }
-      if (core.isClosing() || core.getCoreContainer().isShutDown()) {
-        log.info("We are closed, will not become leader");
-        closed = true;
-        cancelElection();
-        return false;
-      }
+//      if (core.isClosing() || core.getCoreContainer().isShutDown()) {
+//        log.info("We are closed, will not become leader");
+//        closed = true;
+//        cancelElection();
+//        return false;
+//      }
       try {
 
-        core.getSolrCoreState().cancelRecovery(true, false);
+       // core.getSolrCoreState().cancelRecovery(true, false);
 
         ActionThrottle lt;
 
-        MDCLoggingContext.setCore(core);
+      //  MDCLoggingContext.setCore(core);
         lt = core.getUpdateHandler().getSolrCoreState().getLeaderThrottle();
 
         lt.minimumWaitBetweenActions();
@@ -138,7 +137,8 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
         replicaType = cloudCd.getReplicaType();
         // should I be leader?
 
-        ZkShardTerms zkShardTerms = zkController.getShardTerms(collection, shardId);
+        log.info("Check zkShardTerms");
+        ZkShardTerms zkShardTerms = zkController.getShardTermsOrNull(collection, shardId);
         try {
           // if the replica is waiting for leader to see recovery state, the leader should refresh its terms
           if (zkShardTerms != null && zkShardTerms.skipSendingUpdatesTo(coreName)) {
@@ -152,7 +152,7 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
           log.error("Exception while looking at refreshing shard terms", e);
         }
         
-        if (zkShardTerms.registered(coreName) && !zkShardTerms.canBecomeLeader(coreName)) {
+        if (zkShardTerms != null && zkShardTerms.registered(coreName) && !zkShardTerms.canBecomeLeader(coreName)) {
           if (!waitForEligibleBecomeLeaderAfterTimeout(zkShardTerms, coreName, leaderVoteWait)) {
             rejoinLeaderElection(core);
             return false;
@@ -166,9 +166,9 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
 
         PeerSync.PeerSyncResult result = null;
         boolean success = false;
-        if (core.getCoreContainer().isShutDown()) {
-          return false;
-        }
+//        if (core.getCoreContainer().isShutDown()) {
+//          return false;
+//        }
         result = syncStrategy.sync(zkController, core, leaderProps, weAreReplacement);
         log.info("Sync strategy sync result {}", result);
         success = result.isSuccess();
@@ -242,8 +242,11 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
         // in case of leaderVoteWait timeout, a replica with lower term can win the election
         if (setTermToMax) {
           log.error("WARNING: Potential data loss -- Replica {} became leader after timeout (leaderVoteWait) " + "without being up-to-date with the previous leader", coreName);
-          zkController.createCollectionTerms(collection);
-          zkController.getShardTerms(collection, shardId).setTermEqualsToLeader(coreName);
+          try {
+            zkController.getShardTerms(collection, shardId).setTermEqualsToLeader(coreName);
+          } catch (Exception e) {
+            log.error("Exception trying to set shard terms equal to leader", e);
+          }
         }
 
         super.runLeaderProcess(context, weAreReplacement, 0);
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index f146d1b..b2abf8f 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -329,9 +329,8 @@ public class ZkController implements Closeable, Runnable {
     }
 
     public Object call() throws Exception {
-      if (log.isInfoEnabled()) {
-        log.info("Registering core {} afterExpiration? {}", descriptor.getName(), afterExpiration);
-      }
+      log.info("Registering core with ZK {} afterExpiration? {}", descriptor.getName(), afterExpiration);
+
 
       if (zkController.isDcCalled() || zkController.getCoreContainer().isShutDown() || (afterExpiration && !descriptor.getCloudDescriptor().hasRegistered())) {
         return null;
@@ -1422,25 +1421,26 @@ public class ZkController implements Closeable, Runnable {
 
       log.info("Wait to see leader for {}, {}", collection, shardId);
       Replica leader = null;
-      for (int i = 0; i < 30; i++) {
-//        if (leaderElector.isLeader()) {
-//          leader = replica;
-//          break;
-//        }
+      for (int i = 0; i < 15; i++) {
+        if (leaderElector.isLeader()) {
+          leader = replica;
+          break;
+        }
 
         try {
-          if (getCoreContainer().isShutDown() || isDcCalled() || isClosed()) {
-            throw new AlreadyClosedException();
-          }
+//          if (getCoreContainer().isShutDown() || isDcCalled() || isClosed()) {
+//            throw new AlreadyClosedException();
+//          }
 
-          leader = zkStateReader.getLeaderRetry(collection, shardId, 500, false);
+          leader = zkStateReader.getLeaderRetry(collection, shardId, 1500, false);
 
         } catch (TimeoutException timeoutException) {
-
+           log.info("Timeout waiting to see leader, retry");
         }
       }
 
       if (leader == null) {
+        log.error("No leader found while trying to register " + coreName + " with zookeeper");
         throw new SolrException(ErrorCode.SERVER_ERROR, "No leader found while trying to register " + coreName + " with zookeeper");
       }
 
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index ab33eae..26f6a17 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -102,7 +102,6 @@ import org.apache.solr.util.SystemIdResolver;
 import org.apache.zookeeper.KeeperException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.slf4j.MDC;
 
 import static java.util.Objects.requireNonNull;
 import static org.apache.solr.common.params.CommonParams.AUTHC_PATH;
@@ -158,7 +157,6 @@ public class CoreContainer implements Closeable {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
   final SolrCores solrCores = new SolrCores(this);
-  private final boolean isZkAware;
   private volatile boolean startedLoadingCores;
   private volatile boolean loaded;
 
@@ -347,19 +345,17 @@ public class CoreContainer implements Closeable {
     assert ObjectReleaseTracker.track(this);
     assert (closeTracker = new CloseTracker()) != null;
     this.containerProperties = new Properties(config.getSolrProperties());
-    String zkHost = System.getProperty("zkHost");
-    if (!StringUtils.isEmpty(zkHost)) {
-      zkSys = new ZkContainer(zkClient);
-      isZkAware = true;
-    } else {
-      isZkAware = false;
-    }
 
     this.loader = config.getSolrResourceLoader();
 
     this.solrHome = config.getSolrHome();
     this.cfg = requireNonNull(config);
 
+    if (zkClient != null) {
+      zkSys = new ZkContainer(zkClient);
+      zkSys.initZooKeeper(this, cfg.getCloudConfig());
+    }
+
     if (null != this.cfg.getBooleanQueryMaxClauseCount()) {
       IndexSearcher.setMaxClauseCount(this.cfg.getBooleanQueryMaxClauseCount());
     }
@@ -403,9 +399,7 @@ public class CoreContainer implements Closeable {
         }
       });
     }
-    if (zkClient != null) {
-      zkSys.initZooKeeper(this, cfg.getCloudConfig());
-    }
+
     coreConfigService = ConfigSetService.createConfigSetService(cfg, loader, zkSys == null ? null : zkSys.zkController);
 
     containerProperties.putAll(cfg.getSolrProperties());
@@ -606,7 +600,6 @@ public class CoreContainer implements Closeable {
     cfg = null;
     containerProperties = null;
     replayUpdatesExecutor = null;
-    isZkAware = false;
   }
 
 
@@ -881,26 +874,26 @@ public class CoreContainer implements Closeable {
     status |= CORE_DISCOVERY_COMPLETE;
     startedLoadingCores = true;
     for (final CoreDescriptor cd : cds) {
-//      if (isZooKeeperAware()) {
-//        String collection = cd.getCollectionName();
-//        try {
-//          zkSys.zkController.zkStateReader.waitForState(collection, 5, TimeUnit.SECONDS, (n, c) -> {
-//            if (c != null) {
-//              Replica replica = c.getReplica(cd.getName());
-//
-//              if (replica.getState().equals(State.DOWN)) {
-//                return true;
-//              }
-//
-//            }
-//            return false;
-//          });
-//        } catch (InterruptedException e) {
-//          ParWork.propagateInterrupt(e);
-//        } catch (TimeoutException e) {
-//          log.error("Timeout", e);
-//        }
-//      }
+      if (isZooKeeperAware()) {
+        String collection = cd.getCollectionName();
+        try {
+          zkSys.zkController.zkStateReader.waitForState(collection, 5, TimeUnit.SECONDS, (n, c) -> {
+            if (c != null) {
+              Replica replica = c.getReplica(cd.getName());
+
+              if (replica.getState().equals(State.DOWN)) {
+                return true;
+              }
+
+            }
+            return false;
+          });
+        } catch (InterruptedException e) {
+          ParWork.propagateInterrupt(e);
+        } catch (TimeoutException e) {
+          log.error("Timeout", e);
+        }
+      }
 
       if (log.isDebugEnabled()) log.debug("Process core descriptor {} {} {}", cd.getName(), cd.isTransient(), cd.isLoadOnStartup());
       if (cd.isTransient() || !cd.isLoadOnStartup()) {
@@ -911,25 +904,20 @@ public class CoreContainer implements Closeable {
       if (cd.isLoadOnStartup()) {
 
         coreLoadFutures.add(solrCoreLoadExecutor.submit(() -> {
-          SolrCore core;
+          SolrCore core = null;
           MDCLoggingContext.setCoreDescriptor(this, cd);
           try {
             try {
 
               core = createFromDescriptor(cd, false);
 
-              if (core.getDirectoryFactory().isSharedStorage()) {
-                if (isZooKeeperAware()) {
-                  zkSys.getZkController().throwErrorIfReplicaReplaced(cd);
-                }
-              }
-
             } finally {
               solrCores.markCoreAsNotLoading(cd);
             }
-            if (isZooKeeperAware()) {
-              new ZkController.RegisterCoreAsync(zkSys.zkController, cd, false).call();
-            }
+
+          } catch (Exception e){
+            log.error("Error creating and register core {}", cd.getName(), e);
+            throw e;
           } finally {
             MDCLoggingContext.clear();
           }
@@ -1409,9 +1397,7 @@ public class CoreContainer implements Closeable {
             throw new AlreadyClosedException("Solr has been shutdown.");
           }
           solrCores.markCoreAsLoading(dcore);
-          if (isZooKeeperAware()) {
-            ParWork.getRootSharedExecutor().submit(new ZkController.RegisterCoreAsync(zkSys.zkController, dcore, false));
-          }
+
           core = new SolrCore(this, dcore, coreConfig);
         } catch (Exception e) {
           core = processCoreCreateException(e, dcore, coreConfig);
@@ -1421,6 +1407,17 @@ public class CoreContainer implements Closeable {
 
         old = registerCore(dcore, core, true);
         registered = true;
+        solrCores.markCoreAsNotLoading(dcore);
+
+        if (isZooKeeperAware()) {
+          if (!newCollection) {
+            if (core.getDirectoryFactory().isSharedStorage()) {
+              zkSys.getZkController().throwErrorIfReplicaReplaced(dcore);
+            }
+          }
+          new ZkController.RegisterCoreAsync(zkSys.zkController, dcore, false).call();
+        }
+
       } catch (Exception e) {
 
         throw new SolrException(ErrorCode.SERVER_ERROR, e);
@@ -2168,7 +2165,7 @@ public class CoreContainer implements Closeable {
   }
 
   public boolean isZooKeeperAware() {
-    return isZkAware && zkSys != null && zkSys.zkController != null;
+    return zkSys != null && zkSys.zkController != null;
   }
 
   public ZkController getZkController() {
diff --git a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
index 6ac1139..07887b3 100644
--- a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
+++ b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
@@ -44,10 +44,8 @@ import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.ModifiableSolrParams;
-import org.apache.solr.common.util.ExecutorUtil;
 import org.apache.solr.common.util.FastInputStream;
 import org.apache.solr.common.util.NamedList;
-import org.apache.solr.common.util.SolrNamedThreadFactory;
 import org.apache.solr.common.util.SuppressForbidden;
 import org.apache.solr.core.DirectoryFactory;
 import org.apache.solr.core.DirectoryFactory.DirContext;
@@ -522,7 +520,7 @@ public class IndexFetcher {
       }
 
       // Create the sync service
-      fsyncService = ExecutorUtil.newMDCAwareSingleThreadExecutor(new SolrNamedThreadFactory("fsyncService"));
+      fsyncService = ParWork.getExecutorService(15);
       // use a synchronized list because the list is read by other threads (to show details)
       filesDownloaded = Collections.synchronizedList(new ArrayList<Map<String, Object>>());
       // if the generation of master is older than that of the slave , it means they are not compatible to be copied
@@ -812,7 +810,6 @@ public class IndexFetcher {
    * terminate the fsync service and wait for all the tasks to complete. If it is already terminated
    */
   private void terminateAndWaitFsyncService() throws Exception {
-    if (fsyncServiceFuture == null || fsyncService.isTerminated()) return;
     fsyncService.shutdown();
     // give a long wait say 1 hr
     fsyncService.awaitTermination(3600, TimeUnit.SECONDS);
@@ -1722,11 +1719,11 @@ public class IndexFetcher {
         throw e;
       } finally {
         cleanup(null);
-        //if cleanup succeeds . The file is downloaded fully. do an fsync
+        //if cleanup succeeds . The file is downloaded fully
         fsyncServiceFuture = fsyncService.submit(() -> {
           try {
-            log.info("Sync and close fetched file", file);
-            file.sync();
+            log.info("Close fetched file", file);
+            file.close();
           } catch (Exception e) {
             fsyncException = e;
           }
diff --git a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java
index 63eff4b..2c2dd9c 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java
@@ -1189,6 +1189,7 @@ public class RealTimeGetComponent extends SearchComponent
 
     // TODO: get this from cache instead of rebuilding?
     try (UpdateLog.RecentUpdates recentUpdates = ulog.getRecentUpdates()) {
+      log.info("Get updates  versionsRequested={} params={}", versions.size(), params);
       for (Long version : versions) {
         try {
           Object o = recentUpdates.lookup(version);
diff --git a/solr/core/src/java/org/apache/solr/update/AddUpdateCommand.java b/solr/core/src/java/org/apache/solr/update/AddUpdateCommand.java
index e192fe9..8c945f0 100644
--- a/solr/core/src/java/org/apache/solr/update/AddUpdateCommand.java
+++ b/solr/core/src/java/org/apache/solr/update/AddUpdateCommand.java
@@ -43,7 +43,7 @@ public class AddUpdateCommand extends UpdateCommand {
    * Higher level SolrInputDocument, normally used to construct the Lucene Document(s)
    * to index.
    */
-  public SolrInputDocument solrDoc;
+  public volatile SolrInputDocument solrDoc;
 
   /**
    * This is the version of a document, previously indexed, on which the current
@@ -51,7 +51,7 @@ public class AddUpdateCommand extends UpdateCommand {
    * or a full update. A negative value here, e.g. -1, indicates that this add
    * update does not depend on a previous update.
    */
-  public long prevVersion = -1;
+  public volatile long prevVersion = -1;
 
   public boolean overwrite = true;
 
@@ -62,14 +62,14 @@ public class AddUpdateCommand extends UpdateCommand {
 
   public int commitWithin = -1;
 
-  public boolean isLastDocInBatch = false;
+  public volatile boolean isLastDocInBatch = false;
 
   /** Is this a nested update, null means not yet calculated. */
-  public Boolean isNested = null;
+  public volatile Boolean isNested = null;
 
   // optional id in "internal" indexed form... if it is needed and not supplied,
   // it will be obtained from the doc.
-  private BytesRef indexedId;
+  private volatile BytesRef indexedId;
 
   public AddUpdateCommand(SolrQueryRequest req) {
     super(req);
diff --git a/solr/core/src/java/org/apache/solr/update/PeerSync.java b/solr/core/src/java/org/apache/solr/update/PeerSync.java
index 387f27e..d24f905 100644
--- a/solr/core/src/java/org/apache/solr/update/PeerSync.java
+++ b/solr/core/src/java/org/apache/solr/update/PeerSync.java
@@ -441,7 +441,7 @@ public class PeerSync implements SolrMetricProducer {
     Object fingerprint = srsp.getSolrResponse().getResponse().get("fingerprint");
 
     if (log.isInfoEnabled()) {
-      log.info("{} Received {} versions from {} fingerprint:{}", msg(), otherVersions.size(), sreq.shards[0], fingerprint);
+      log.info("{} Received {} versions from {} {} fingerprint:{}", msg(), otherVersions.size(), otherVersions, sreq.shards[0], fingerprint);
     }
     if (fingerprint != null) {
       sreq.fingerprint = IndexFingerprint.fromObject(fingerprint);
@@ -524,7 +524,7 @@ public class PeerSync implements SolrMetricProducer {
 
     SyncShardRequest sreq = (SyncShardRequest) srsp.getShardRequest();
     if (updates.size() < sreq.totalRequestedUpdates) {
-      log.error("{} Requested {} updates from {} but retrieved {}", msg(), sreq.totalRequestedUpdates, sreq.shards[0], updates.size());
+      log.error("{} Requested {} updates from {} but retrieved {} {}", msg(), sreq.totalRequestedUpdates, sreq.shards[0], updates.size(), srsp.getSolrResponse().getResponse());
       return false;
     }
     
@@ -746,7 +746,7 @@ public class PeerSync implements SolrMetricProducer {
       return true;
     }
 
-    MissedUpdatesRequest handleVersionsWithRanges(List<Long> otherVersions, boolean completeList) {
+    static MissedUpdatesRequest handleVersionsWithRanges(List<Long> ourUpdates, List<Long> otherVersions, boolean completeList, long ourLowThreshold) {
       // we may endup asking for updates for too many versions, causing 2MB post payload limit. Construct a range of
       // versions to request instead of asking individual versions
       List<String> rangesToRequest = new ArrayList<>();
@@ -788,9 +788,31 @@ public class PeerSync implements SolrMetricProducer {
       }
 
       String rangesToRequestStr = rangesToRequest.stream().collect(Collectors.joining(","));
+
+      log.info("handleVersionsWithRanges rangesToRequestStr={} otherVersions={} ourVersions={} completeList={} totalRequestedVersions={}", rangesToRequestStr, otherVersions, ourUpdates, completeList, totalRequestedVersions);
+
       return MissedUpdatesRequest.of(rangesToRequestStr, totalRequestedVersions);
     }
 
+    public static void main(String[] args) {
+
+      List<Long> ourUpdates = new ArrayList<>();
+      ourUpdates.add(1689636098592997376l);
+      ourUpdates.add(1689636098591948800l);
+      ourUpdates.add(1689636098531131395l);
+      ourUpdates.add(1689636098531131394l);
+
+//1689636098592997376, 1689636098591948800, 1689636098585657345
+      List<Long> otherVersions = new ArrayList<>();
+      otherVersions.add(1689636098531131395l);
+      otherVersions.add(1689636098531131394l);
+      otherVersions.add(0l);
+//1689636098531131395, 1689636098531131394, 0
+      MissedUpdatesRequest result = handleVersionsWithRanges(ourUpdates, otherVersions, false, 0);
+      System.out.println(result.versionsAndRanges);
+      System.out.println(result.totalRequestedUpdates);
+    }
+
     MissedUpdatesRequest handleIndividualVersions(List<Long> otherVersions, boolean completeList) {
       List<Long> toRequest = new ArrayList<>();
       for (Long otherVersion : otherVersions) {
@@ -867,7 +889,7 @@ public class PeerSync implements SolrMetricProducer {
 
       MissedUpdatesRequest updatesRequest;
       if (canHandleVersionRanges.get()) {
-        updatesRequest = handleVersionsWithRanges(otherVersions, completeList);
+        updatesRequest = handleVersionsWithRanges(ourUpdates, otherVersions, completeList, ourLowThreshold);
       } else {
         updatesRequest = handleIndividualVersions(otherVersions, completeList);
       }
diff --git a/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java b/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java
index 4582fc6..b670372 100644
--- a/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java
+++ b/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java
@@ -244,7 +244,7 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
 
     MissedUpdatesRequest updatesRequest = missedUpdatesFinder.find(otherVersions, leaderUrl, () -> core.getSolrConfig().useRangeVersionsForPeerSync && canHandleVersionRanges());
     if (updatesRequest == MissedUpdatesRequest.EMPTY) {
-      if (doFingerprint) return MissedUpdatesRequest.UNABLE_TO_SYNC;
+      if (doFingerprint && updatesRequest.totalRequestedUpdates > 0) return MissedUpdatesRequest.UNABLE_TO_SYNC;
       return MissedUpdatesRequest.ALREADY_IN_SYNC;
     }
 
@@ -273,7 +273,7 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
     List<Object> updates = (List<Object>)rsp.get("updates");
 
     if (updates.size() < numRequestedUpdates) {
-      log.error("{} Requested {} updated from {} but retrieved {}", msg(), numRequestedUpdates, leaderUrl, updates.size());
+      log.error("{} Requested {} updated from {} but retrieved {} {}", msg(), numRequestedUpdates, leaderUrl, updates.size(), rsp);
       return false;
     }
 
@@ -298,13 +298,16 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
         // only DBI or DBQ in the gap (above) will satisfy this predicate
         return version > leaderFingerprint.getMaxVersionEncountered() && (oper == UpdateLog.DELETE || oper == UpdateLog.DELETE_BY_QUERY);
       });
+      log.info("existDBIOrDBQInTheGap={}", existDBIOrDBQInTheGap);
       if (!existDBIOrDBQInTheGap) {
         // it is safe to use leaderFingerprint.maxVersionEncountered as cut point now.
         updates.removeIf(e -> {
           @SuppressWarnings({"unchecked"})
           List<Object> u = (List<Object>) e;
           long version = (Long) u.get(1);
-          return version > leaderFingerprint.getMaxVersionEncountered();
+          boolean success = version > leaderFingerprint.getMaxVersionEncountered();
+          log.info("existDBIOrDBQInTheGap version={}  leaderFingerprint.getMaxVersionEncountered={} success={}", version, leaderFingerprint.getMaxVersionEncountered(), success);
+          return success;
         });
       }
     }
@@ -312,6 +315,7 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
     try {
       updater.applyUpdates(updates, leaderUrl);
     } catch (Exception e) {
+      log.error("Could not apply updates", e);
       return false;
     }
     return true;
@@ -386,6 +390,7 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
       if (cmp != 0) {
         if (log.isDebugEnabled()) log.debug("Leader fingerprint: {}, Our fingerprint: {}", leaderFingerprint , ourFingerprint);
       }
+
       return cmp == 0;  // currently, we only check for equality...
     } catch (IOException e) {
       log.warn("Could not confirm if we are already in sync. Continue with PeerSync");
@@ -426,7 +431,7 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
       boolean completeList = leaderVersions.size() < nUpdates;
       MissedUpdatesRequest updatesRequest;
       if (canHandleVersionRanges.get()) {
-        updatesRequest = handleVersionsWithRanges(leaderVersions, completeList);
+        updatesRequest = handleVersionsWithRanges(ourUpdates, leaderVersions, completeList, ourLowThreshold);
       } else {
         updatesRequest = handleIndividualVersions(leaderVersions, completeList);
       }
diff --git a/solr/core/src/java/org/apache/solr/update/UpdateLog.java b/solr/core/src/java/org/apache/solr/update/UpdateLog.java
index 0f51d41..39fa5de 100644
--- a/solr/core/src/java/org/apache/solr/update/UpdateLog.java
+++ b/solr/core/src/java/org/apache/solr/update/UpdateLog.java
@@ -190,25 +190,25 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
   protected volatile TransactionLog bufferTlog;
   protected volatile TransactionLog tlog;
 
-  protected TransactionLog prevTlog;
+  protected volatile TransactionLog prevTlog;
   protected TransactionLog prevTlogOnPrecommit;
   protected final Deque<TransactionLog> logs = new LinkedList<>();  // list of recent logs, newest first
   protected final LinkedList<TransactionLog> newestLogsOnStartup = new LinkedList<>();
-  protected int numOldRecords;  // number of records in the recent logs
+  protected volatile int numOldRecords;  // number of records in the recent logs
 
   protected volatile Map<BytesRef,LogPtr> map = new ConcurrentHashMap<>(32);
   protected volatile Map<BytesRef,LogPtr> prevMap;  // used while committing/reopening is happening
   protected volatile Map<BytesRef,LogPtr> prevMap2;  // used while committing/reopening is happening
-  protected TransactionLog prevMapLog;  // the transaction log used to look up entries found in prevMap
-  protected TransactionLog prevMapLog2;  // the transaction log used to look up entries found in prevMap2
+  protected volatile TransactionLog prevMapLog;  // the transaction log used to look up entries found in prevMap
+  protected volatile TransactionLog prevMapLog2;  // the transaction log used to look up entries found in prevMap2
 
   protected final int numDeletesToKeep = 1000;
   protected final int numDeletesByQueryToKeep = 100;
-  protected int numRecordsToKeep;
+  protected volatile int numRecordsToKeep;
   protected volatile int maxNumLogsToKeep;
   protected volatile int numVersionBuckets = 65536; // This should only be used to initialize VersionInfo... the actual number of buckets may be rounded up to a power of two.
-  protected Long maxVersionFromIndex = null;
-  protected boolean existOldBufferLog = false;
+  protected volatile Long maxVersionFromIndex = null;
+  protected volatile boolean existOldBufferLog = false;
 
   // keep track of deletes only... this is not updated on an add
   protected Map<BytesRef, LogPtr> oldDeletes = Collections.synchronizedMap(new LinkedHashMap<BytesRef, LogPtr>(numDeletesToKeep) {
@@ -1142,22 +1142,18 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
   // synchronization is needed for stronger guarantees (as VersionUpdateProcessor does).
   public Long lookupVersion(BytesRef indexedId) {
     LogPtr entry;
-    tlogLock.lock();
-    try {
-      entry = map.get(indexedId);
-      // SolrCore.verbose("TLOG: lookup ver: for id ",indexedId.utf8ToString(),"in map",System.identityHashCode(map),"got",entry,"lookupLog=",lookupLog);
-      if (entry == null && prevMap != null) {
-        entry = prevMap.get(indexedId);
-        // something found in prevMap will always be found in prevMapLog (which could be tlog or prevTlog)
-        // SolrCore.verbose("TLOG: lookup ver: for id ",indexedId.utf8ToString(),"in prevMap",System.identityHashCode(map),"got",entry,"lookupLog=",lookupLog);
-      }
-      if (entry == null && prevMap2 != null) {
-        entry = prevMap2.get(indexedId);
-        // something found in prevMap2 will always be found in prevMapLog2 (which could be tlog or prevTlog)
-        // SolrCore.verbose("TLOG: lookup ver: for id ",indexedId.utf8ToString(),"in prevMap2",System.identityHashCode(map),"got",entry,"lookupLog=",lookupLog);
-      }
-    } finally {
-      tlogLock.unlock();
+
+    entry = map.get(indexedId);
+    // SolrCore.verbose("TLOG: lookup ver: for id ",indexedId.utf8ToString(),"in map",System.identityHashCode(map),"got",entry,"lookupLog=",lookupLog);
+    if (entry == null && prevMap != null) {
+      entry = prevMap.get(indexedId);
+      // something found in prevMap will always be found in prevMapLog (which could be tlog or prevTlog)
+      // SolrCore.verbose("TLOG: lookup ver: for id ",indexedId.utf8ToString(),"in prevMap",System.identityHashCode(map),"got",entry,"lookupLog=",lookupLog);
+    }
+    if (entry == null && prevMap2 != null) {
+      entry = prevMap2.get(indexedId);
+      // something found in prevMap2 will always be found in prevMapLog2 (which could be tlog or prevTlog)
+      // SolrCore.verbose("TLOG: lookup ver: for id ",indexedId.utf8ToString(),"in prevMap2",System.identityHashCode(map),"got",entry,"lookupLog=",lookupLog);
     }
 
 
@@ -1174,12 +1170,8 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
 
     // We can't get any version info for deletes from the index, so if the doc
     // wasn't found, check a cache of recent deletes.
-    tlogLock.lock();
-    try {
-      entry = oldDeletes.get(indexedId);
-    } finally {
-      tlogLock.unlock();
-    }
+
+    entry = oldDeletes.get(indexedId);
 
     if (entry != null) {
       return entry.version;
@@ -1195,15 +1187,10 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
     if (syncLevel == SyncLevel.NONE) {
       return;
     }
-    TransactionLog currLog;
-    tlogLock.lock();
-    try {
-      currLog = tlog;
-      if (currLog == null) return;
-      currLog.incref();
-    } finally {
-      tlogLock.unlock();
-    }
+
+    TransactionLog currLog = tlog;
+    if (currLog == null) return;
+    currLog.incref();
 
     try {
       currLog.finish(syncLevel);
@@ -1327,20 +1314,16 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
   public void commitAndSwitchToNewTlog(CommitUpdateCommand cuc) {
     versionInfo.blockUpdates();
     try {
-      tlogLock.lock();
+      if (tlog == null) {
+        return;
+      }
+      preCommit(cuc);
       try {
-        if (tlog == null) {
-          return;
-        }
-        preCommit(cuc);
-        try {
-          copyOverOldUpdates(cuc.getVersion());
-        } finally {
-          postCommit(cuc);
-        }
+        copyOverOldUpdates(cuc.getVersion());
       } finally {
-        tlogLock.unlock();
+        postCommit(cuc);
       }
+
     } finally {
       versionInfo.unblockUpdates();
     }
@@ -1611,19 +1594,29 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
       for (List<Update> singleList : updateList) {
         for (Update ptr : singleList) {
           if(Math.abs(ptr.version) > Math.abs(maxVersion)) continue;
-          ret.add(ptr.version);
+          if (ptr.version != 0) {
+            ret.add(ptr.version);
+          }
           if (--n <= 0) return ret;
         }
       }
+      log.info("Return getVersions {} {}", n, ret);
 
       return ret;
     }
 
     public Object lookup(long version) {
+      log.info("lookup {}", version);
       Update update = updates.get(version);
       if (update == null) return null;
 
-      return update.log.lookup(update.pointer);
+      log.info("found update from updates {} {}", update.version, updates.size());
+
+      Object object = update.log.lookup(update.pointer);
+
+      log.info("found update from log {} {} ptr={} object={}", update.version, update.log, update.pointer, object);
+
+      return object;
     }
 
     /** Returns the list of deleteByQueries that happened after the given version */
@@ -1640,7 +1633,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
 
     private void update() {
       int numUpdates = 0;
-      updateList = new ArrayList<>(logList.size());
+      updateList = new ArrayList<>(numRecordsToKeep);
       deleteByQueryList = new ArrayList<>();
       deleteList = new ArrayList<>();
       updates = new HashMap<>(numRecordsToKeep);
@@ -1705,12 +1698,14 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
               // would be caused by a corrupt transaction log
             } catch (Exception ex) {
               log.warn("Exception reverse reading log", ex);
-              break;
+             // break;
             }
 
             numUpdates++;
           }
 
+          log.info("Recent updates updates numUpdates={} numUpdatesToKeep={}", numUpdates, numRecordsToKeep);
+
         } catch (IOException | AssertionError e) { // catch AssertionError to handle certain test failures correctly
           // failure to read a log record isn't fatal
           log.error("Exception reading versions from log",e);
@@ -1744,6 +1739,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
   public RecentUpdates getRecentUpdates() {
     Deque<TransactionLog> logList;
     tlogLock.lock();
+    RecentUpdates recentUpdates;
     try {
       logList = new LinkedList<>(logs);
       for (TransactionLog log : logList) {
@@ -1761,14 +1757,15 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
         bufferTlog.incref();
         logList.addFirst(bufferTlog);
       }
+
+      recentUpdates = new RecentUpdates(logList, numRecordsToKeep);
     } finally {
       tlogLock.unlock();
     }
 
     // TODO: what if I hand out a list of updates, then do an update, then hand out another list (and
     // one of the updates I originally handed out fell off the list).  Over-request?
-    return new RecentUpdates(logList, numRecordsToKeep);
-
+    return recentUpdates;
   }
 
   public void bufferUpdates() {
diff --git a/solr/core/src/java/org/apache/solr/update/UpdateShardHandler.java b/solr/core/src/java/org/apache/solr/update/UpdateShardHandler.java
index 0f7800e..ecdf3b6 100644
--- a/solr/core/src/java/org/apache/solr/update/UpdateShardHandler.java
+++ b/solr/core/src/java/org/apache/solr/update/UpdateShardHandler.java
@@ -108,7 +108,7 @@ public class UpdateShardHandler implements SolrInfoBean {
           .connectionTimeout(cfg.getDistributedConnectionTimeout())
           .idleTimeout(cfg.getDistributedSocketTimeout());
     }
-    updateOnlyClient = updateOnlyClientBuilder.markInternalRequest().strictEventOrdering(false).build();
+    updateOnlyClient = updateOnlyClientBuilder.markInternalRequest().strictEventOrdering(true).build();
     updateOnlyClient.enableCloseLock();
    // updateOnlyClient.addListenerFactory(updateHttpListenerFactory);
     Set<String> queryParams = new HashSet<>(2);
diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIDistributedZkTest.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIDistributedZkTest.java
index 4e7b03d..b044358 100644
--- a/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIDistributedZkTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/CollectionsAPIDistributedZkTest.java
@@ -289,6 +289,7 @@ public class CollectionsAPIDistributedZkTest extends SolrCloudTestCase {
   }
 
   @Test
+  @Ignore
   public void testDeleteNonExistentCollection() throws Exception {
 
     expectThrows(Exception.class, () -> {
diff --git a/solr/solrj/src/java/org/apache/solr/common/SolrInputDocument.java b/solr/solrj/src/java/org/apache/solr/common/SolrInputDocument.java
index 79305a5..4a457d0 100644
--- a/solr/solrj/src/java/org/apache/solr/common/SolrInputDocument.java
+++ b/solr/solrj/src/java/org/apache/solr/common/SolrInputDocument.java
@@ -19,6 +19,7 @@ package org.apache.solr.common;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
@@ -42,7 +43,7 @@ public class SolrInputDocument extends SolrDocumentBase<SolrInputField, SolrInpu
   private List<SolrInputDocument> _childDocuments;
 
   public SolrInputDocument(String... fields) {
-    _fields = new LinkedHashMap<>();
+    _fields = Collections.synchronizedMap(new LinkedHashMap<>());
     assert fields.length % 2 == 0;
     for (int i = 0; i < fields.length; i += 2) {
       addField(fields[i], fields[i + 1]);
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZooKeeper.java b/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZooKeeper.java
index 0546b05..9338679 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZooKeeper.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZooKeeper.java
@@ -104,22 +104,26 @@ public class SolrZooKeeper extends ZooKeeper {
   @Override
   public void close() {
     if (closeTracker != null) closeTracker.close();
+//    try {
+//      try {
+//        RequestHeader h = new RequestHeader();
+//        h.setType(ZooDefs.OpCode.closeSession);
+//
+//        cnxn.submitRequest(h, null, null, null);
+//      } catch (InterruptedException e) {
+//        // ignore, close the send/event threads
+//      } finally {
+//        ZooKeeperExposed zk = new ZooKeeperExposed(this, cnxn);
+//        zk.closeCnxn();
+//      }
+//    } catch (Exception e) {
+//      log.warn("Exception closing zookeeper client", e);
+//    }
     try {
-      try {
-        RequestHeader h = new RequestHeader();
-        h.setType(ZooDefs.OpCode.closeSession);
-
-        cnxn.submitRequest(h, null, null, null);
-      } catch (InterruptedException e) {
-        // ignore, close the send/event threads
-      } finally {
-        ZooKeeperExposed zk = new ZooKeeperExposed(this, cnxn);
-        zk.closeCnxn();
-      }
-    } catch (Exception e) {
-      log.warn("Exception closing zookeeper client", e);
+      super.close();
+    } catch (InterruptedException e) {
+      e.printStackTrace();
     }
-
   }
 
 }
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
index 48c3329..0e3a685 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
@@ -994,13 +994,13 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
       Slice slice = coll.getSlice(shard);
       if (slice != null) {
         Replica leader = slice.getLeader();
-        boolean valid;
+        boolean valid = false;
         try {
-          valid = mustBeLive ? isNodeLive(leader.getNodeName()) || zkClient.exists(COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" + slice.getName() + "/leader") : isNodeLive(leader.getNodeName());
+          valid = mustBeLive ? leader != null && isNodeLive(leader.getNodeName()) || zkClient.exists(COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" + slice.getName() + "/leader") : leader != null && isNodeLive(leader.getNodeName());
         } catch (KeeperException e) {
-          throw new SolrException(ErrorCode.SERVER_ERROR, e);
+
         } catch (InterruptedException e) {
-          throw new SolrException(ErrorCode.SERVER_ERROR, e);
+
         }
         if (leader != null && leader.getState() == Replica.State.ACTIVE && valid) {
           return leader;
@@ -1022,36 +1022,25 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
         Slice slice = c.getSlice(shard);
         if (slice == null) return false;
         Replica leader = slice.getLeader();
-
-        if (leader != null && leader.getState() == Replica.State.ACTIVE) {
-          boolean valid = false;
-          try {
-            valid = mustBeLive ?  isNodeLive(leader.getNodeName()) || zkClient.exists(COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" + slice.getName()  + "/leader") : isNodeLive(leader.getNodeName());
-          } catch (KeeperException e) {
-            throw new SolrException(ErrorCode.SERVER_ERROR, e);
-          } catch (InterruptedException e) {
-            throw new SolrException(ErrorCode.SERVER_ERROR, e);
-          }
-          if (valid) {
-            returnLeader.set(leader);
-            return true;
-          }
+        boolean valid = false;
+        try {
+          valid = mustBeLive ?  (leader != null && isNodeLive(leader.getNodeName())) ||
+              zkClient.exists(COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" + slice.getName()  + "/leader") :
+              leader != null && isNodeLive(leader.getNodeName());
+        } catch (KeeperException e) {
+          return false;
+        } catch (InterruptedException e) {
+          return false;
+        }
+        if (leader != null && leader.getState() == Replica.State.ACTIVE && valid) {
+          returnLeader.set(leader);
+          return true;
         }
         Collection<Replica> replicas = slice.getReplicas();
         for (Replica replica : replicas) {
-          if ("true".equals(replica.getProperty(LEADER_PROP)) && replica.getState() == Replica.State.ACTIVE) {
-            boolean valid = false;
-            try {
-              valid = mustBeLive ?  zkClient.exists(COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" + slice.getName()  + "/leader") : isNodeLive(leader.getNodeName());
-            } catch (KeeperException e) {
-              throw new SolrException(ErrorCode.SERVER_ERROR, e);
-            } catch (InterruptedException e) {
-              throw new SolrException(ErrorCode.SERVER_ERROR, e);
-            }
-            if (valid) {
-              returnLeader.set(replica);
-              return true;
-            }
+          if ("true".equals(replica.getProperty(LEADER_PROP)) && replica.getState() == Replica.State.ACTIVE && valid) {
+            returnLeader.set(replica);
+            return true;
           }
         }
 
@@ -1991,9 +1980,10 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
     registerDocCollectionWatcher(collection, wrapper);
     registerLiveNodesListener(wrapper);
 
-//    DocCollection state = clusterState.getCollectionOrNull(collection);
-//
-//    removeCollectionStateWatcher(collection, stateWatcher);
+    DocCollection state = clusterState.getCollectionOrNull(collection);
+    if (stateWatcher.onStateChanged(liveNodes, state) == true) {
+      removeCollectionStateWatcher(collection, stateWatcher);
+    }
   }
 
   /**


[lucene-solr] 12/16: @1288 Only check state in prep recovery and don't try to unload cores that look to have moved.

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 34cc96692eec95518146dbd4fec5f879c9ee1b7a
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Thu Jan 21 23:08:21 2021 -0600

    @1288 Only check state in prep recovery and don't try to unload cores that look to have moved.
---
 .../src/java/org/apache/solr/cloud/CloudUtil.java  |  19 ---
 .../java/org/apache/solr/core/CoreContainer.java   | 183 ++++++++++-----------
 .../apache/solr/handler/admin/PrepRecoveryOp.java  |   2 +-
 3 files changed, 86 insertions(+), 118 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/CloudUtil.java b/solr/core/src/java/org/apache/solr/cloud/CloudUtil.java
index a0f6c51..547d7484 100644
--- a/solr/core/src/java/org/apache/solr/cloud/CloudUtil.java
+++ b/solr/core/src/java/org/apache/solr/cloud/CloudUtil.java
@@ -105,25 +105,6 @@ public class CloudUtil {
     }
   }
 
-  public static boolean checkIfValidCloudCore(CoreContainer cc, CoreDescriptor desc) {
-    if (desc.getCloudDescriptor() == null) return false;
-    ZkController zkController = cc.getZkController();
-    String coreName = desc.getName();
-
-    // if we see our core node name on a different base url, unload
-    final DocCollection docCollection = zkController.getClusterState().getCollectionOrNull(desc.getCloudDescriptor().getCollectionName());
-    if (docCollection == null || docCollection.getReplica(coreName) == null) {
-
-      try {
-        cc.unload(desc.getName(), true, true, true);
-      } catch (Exception e) {
-        log.error("unload exception", e);
-      }
-      return false;
-    }
-    return true;
-  }
-
   /**
    * Returns a displayable unified path to the given resource. For non-solrCloud that will be the
    * same as getConfigDir, but for Cloud it will be getConfigSetZkPath ending in a /
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index 54248c2..ab33eae 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -28,7 +28,6 @@ import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.store.Directory;
 import org.apache.solr.client.solrj.SolrClient;
 import org.apache.solr.client.solrj.cloud.SolrCloudManager;
-import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
 import org.apache.solr.client.solrj.impl.CloudHttp2SolrClient;
 import org.apache.solr.client.solrj.impl.HttpClientUtil;
 import org.apache.solr.client.solrj.impl.SolrHttpClientBuilder;
@@ -36,7 +35,6 @@ import org.apache.solr.client.solrj.impl.SolrHttpClientContextBuilder;
 import org.apache.solr.client.solrj.io.SolrClientCache;
 import org.apache.solr.client.solrj.util.SolrIdentifierValidator;
 import org.apache.solr.cloud.CloudDescriptor;
-import org.apache.solr.cloud.CloudUtil;
 import org.apache.solr.cloud.Overseer;
 import org.apache.solr.cloud.ZkController;
 import org.apache.solr.cloud.overseer.OverseerAction;
@@ -729,31 +727,29 @@ public class CoreContainer implements Closeable {
 
     try {
 
-        solrCores.load(loader);
+      solrCores.load(loader);
 
-        logging = LogWatcher.newRegisteredLogWatcher(cfg.getLogWatcherConfig(), loader);
+      logging = LogWatcher.newRegisteredLogWatcher(cfg.getLogWatcherConfig(), loader);
 
-        hostName = cfg.getNodeName();
+      hostName = cfg.getNodeName();
 
-        collectionsHandler = createHandler(COLLECTIONS_HANDLER_PATH, cfg.getCollectionsHandlerClass(), CollectionsHandler.class);
-        infoHandler = createHandler(INFO_HANDLER_PATH, cfg.getInfoHandlerClass(), InfoHandler.class);
-        coreAdminHandler = createHandler(CORES_HANDLER_PATH, cfg.getCoreAdminHandlerClass(), CoreAdminHandler.class);
-        configSetsHandler = createHandler(CONFIGSETS_HANDLER_PATH, cfg.getConfigSetsHandlerClass(), ConfigSetsHandler.class);
+      collectionsHandler = createHandler(COLLECTIONS_HANDLER_PATH, cfg.getCollectionsHandlerClass(), CollectionsHandler.class);
+      infoHandler = createHandler(INFO_HANDLER_PATH, cfg.getInfoHandlerClass(), InfoHandler.class);
+      coreAdminHandler = createHandler(CORES_HANDLER_PATH, cfg.getCoreAdminHandlerClass(), CoreAdminHandler.class);
+      configSetsHandler = createHandler(CONFIGSETS_HANDLER_PATH, cfg.getConfigSetsHandlerClass(), ConfigSetsHandler.class);
 
-        createHandler(ZK_PATH, ZookeeperInfoHandler.class.getName(), ZookeeperInfoHandler.class);
-        createHandler(ZK_STATUS_PATH, ZookeeperStatusHandler.class.getName(), ZookeeperStatusHandler.class);
+      createHandler(ZK_PATH, ZookeeperInfoHandler.class.getName(), ZookeeperInfoHandler.class);
+      createHandler(ZK_STATUS_PATH, ZookeeperStatusHandler.class.getName(), ZookeeperStatusHandler.class);
 
-
-        if (isZooKeeperAware()) {
-          try {
-            zkSys.start(this);
-          } catch (IOException e) {
-            throw new SolrException(ErrorCode.SERVER_ERROR, e);
-          } catch (KeeperException e) {
-            throw new SolrException(ErrorCode.SERVER_ERROR, e);
-          }
+      if (isZooKeeperAware()) {
+        try {
+          zkSys.start(this);
+        } catch (IOException e) {
+          throw new SolrException(ErrorCode.SERVER_ERROR, e);
+        } catch (KeeperException e) {
+          throw new SolrException(ErrorCode.SERVER_ERROR, e);
         }
-
+      }
 
       try (ParWork work = new ParWork(this, false, true)) {
 
@@ -807,7 +803,6 @@ public class CoreContainer implements Closeable {
 
         work.addCollect();
 
-
         work.collect("", () -> {
           solrClientCache = new SolrClientCache(isZooKeeperAware() ? zkSys.getZkController().getZkStateReader() : null, updateShardHandler.getTheSharedHttpClient());
         });
@@ -833,7 +828,6 @@ public class CoreContainer implements Closeable {
       throw new SolrException(ErrorCode.SERVER_ERROR, "Exception in CoreContainer load", e);
     }
 
-
     if (!containerHandlers.keySet().contains(CORES_HANDLER_PATH)) {
       throw new IllegalStateException("No core admin path was loaded " + CORES_HANDLER_PATH);
     }
@@ -877,98 +871,91 @@ public class CoreContainer implements Closeable {
     }
 
     List<Future<SolrCore>> coreLoadFutures = null;
-    try {
-      List<CoreDescriptor> cds = coresLocator.discover(this);
-      coreLoadFutures = new ArrayList<>(cds.size());
-      if (isZooKeeperAware()) {
-        cds = CoreSorter.sortCores(this, cds);
+
+    List<CoreDescriptor> cds = coresLocator.discover(this);
+    coreLoadFutures = new ArrayList<>(cds.size());
+    if (isZooKeeperAware()) {
+      cds = CoreSorter.sortCores(this, cds);
+    }
+    checkForDuplicateCoreNames(cds);
+    status |= CORE_DISCOVERY_COMPLETE;
+    startedLoadingCores = true;
+    for (final CoreDescriptor cd : cds) {
+//      if (isZooKeeperAware()) {
+//        String collection = cd.getCollectionName();
+//        try {
+//          zkSys.zkController.zkStateReader.waitForState(collection, 5, TimeUnit.SECONDS, (n, c) -> {
+//            if (c != null) {
+//              Replica replica = c.getReplica(cd.getName());
+//
+//              if (replica.getState().equals(State.DOWN)) {
+//                return true;
+//              }
+//
+//            }
+//            return false;
+//          });
+//        } catch (InterruptedException e) {
+//          ParWork.propagateInterrupt(e);
+//        } catch (TimeoutException e) {
+//          log.error("Timeout", e);
+//        }
+//      }
+
+      if (log.isDebugEnabled()) log.debug("Process core descriptor {} {} {}", cd.getName(), cd.isTransient(), cd.isLoadOnStartup());
+      if (cd.isTransient() || !cd.isLoadOnStartup()) {
+        solrCores.addCoreDescriptor(cd);
+      } else {
+        solrCores.markCoreAsLoading(cd);
       }
-      checkForDuplicateCoreNames(cds);
-      status |= CORE_DISCOVERY_COMPLETE;
+      if (cd.isLoadOnStartup()) {
 
-      for (final CoreDescriptor cd : cds) {
-        if (isZooKeeperAware()) {
-          String collection = cd.getCollectionName();
+        coreLoadFutures.add(solrCoreLoadExecutor.submit(() -> {
+          SolrCore core;
+          MDCLoggingContext.setCoreDescriptor(this, cd);
           try {
-            zkSys.zkController.zkStateReader.waitForState(collection, 5, TimeUnit.SECONDS, (n, c) -> {
-              if (c != null) {
-                Replica replica = c.getReplica(cd.getName());
-
-                if (replica.getState().equals(State.DOWN)) {
-                  return true;
-                }
-
-              }
-              return false;
-            });
-          } catch (InterruptedException e) {
-            ParWork.propagateInterrupt(e);
-          } catch (TimeoutException e) {
-            log.error("Timeout", e);
-          }
-        }
-
-        if (log.isDebugEnabled()) log.debug("Process core descriptor {} {} {}", cd.getName(), cd.isTransient(), cd.isLoadOnStartup());
-        if (cd.isTransient() || !cd.isLoadOnStartup()) {
-          solrCores.addCoreDescriptor(cd);
-        } else {
-          solrCores.markCoreAsLoading(cd);
-        }
-        if (cd.isLoadOnStartup()) {
-          if (isZooKeeperAware() && !CloudUtil.checkIfValidCloudCore(this, cd)) {
-            continue;
-          }
-
-          coreLoadFutures.add(solrCoreLoadExecutor.submit(() -> {
-            SolrCore core;
-            MDCLoggingContext.setCoreDescriptor(this, cd);
             try {
-              try {
 
-                core = createFromDescriptor(cd, false);
+              core = createFromDescriptor(cd, false);
 
-                if (core.getDirectoryFactory().isSharedStorage()) {
-                  if (isZooKeeperAware()) {
-                    zkSys.getZkController().throwErrorIfReplicaReplaced(cd);
-                  }
+              if (core.getDirectoryFactory().isSharedStorage()) {
+                if (isZooKeeperAware()) {
+                  zkSys.getZkController().throwErrorIfReplicaReplaced(cd);
                 }
-
-              } finally {
-                solrCores.markCoreAsNotLoading(cd);
-              } if (isZooKeeperAware()) {
-                new ZkController.RegisterCoreAsync(zkSys.zkController, cd, false).call();
               }
+
             } finally {
-              MDCLoggingContext.clear();
+              solrCores.markCoreAsNotLoading(cd);
             }
-            return core;
-          }));
-        }
-      }
-      if (isZooKeeperAware()) {
-
-        ParWork.getRootSharedExecutor().submit(() -> {
-          zkSys.getZkController().createEphemeralLiveNode();
-        });
+            if (isZooKeeperAware()) {
+              new ZkController.RegisterCoreAsync(zkSys.zkController, cd, false).call();
+            }
+          } finally {
+            MDCLoggingContext.clear();
+          }
+          return core;
+        }));
       }
-    } finally {
+    }
+    if (isZooKeeperAware()) {
+      zkSys.getZkController().createEphemeralLiveNode();
+    }
 
-      startedLoadingCores = true;
-      if (coreLoadFutures != null && !asyncSolrCoreLoad) {
-        for (Future<SolrCore> future : coreLoadFutures) {
-          try {
-            future.get();
-          } catch (InterruptedException e) {
-            ParWork.propagateInterrupt(e);
-          } catch (ExecutionException e) {
-            log.error("Error waiting for SolrCore to be loaded on startup", e.getCause());
-          }
+    if (coreLoadFutures != null && !asyncSolrCoreLoad) {
+      for (Future<SolrCore> future : coreLoadFutures) {
+        try {
+          future.get();
+        } catch (InterruptedException e) {
+          ParWork.propagateInterrupt(e);
+        } catch (ExecutionException e) {
+          log.error("Error waiting for SolrCore to be loaded on startup", e.getCause());
         }
       }
     }
+
     if (isZooKeeperAware()) {
 
-     // zkSys.getZkController().checkOverseerDesignate();
+      // zkSys.getZkController().checkOverseerDesignate();
       // initialize this handler here when SolrCloudManager is ready
     }
     // This is a bit redundant but these are two distinct concepts for all they're accomplished at the same time.
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
index b463137..ca07d0b 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
@@ -79,7 +79,7 @@ class PrepRecoveryOp implements CoreAdminHandler.CoreAdminOp {
         boolean isLive = false;
         if (replica != null) {
           isLive = coreContainer.getZkController().getZkStateReader().isNodeLive(replica.getNodeName());
-          if ((replica.getState() == waitForState || replica.getState() == Replica.State.ACTIVE) && isLive) {
+          if (replica.getState() == waitForState) {
             // if (log.isDebugEnabled()) log.debug("replica={} state={} waitForState={}", replica, replica.getState(), waitForState);
             log.info("replica={} state={} waitForState={} isLive={}", replica, replica.getState(), waitForState, coreContainer.getZkController().getZkStateReader().isNodeLive(replica.getNodeName()));
             return true;


[lucene-solr] 16/16: wip

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 4a4e36a2251864672ffa372a045856fd9b46c155
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Sat Jan 23 20:17:40 2021 -0600

    wip
---
 .../apache/lucene/store/NRTCachingDirectory.java   |   4 +-
 solr/bin/solr                                      |   2 +-
 .../java/org/apache/solr/cloud/LeaderElector.java  |   2 +
 .../src/java/org/apache/solr/cloud/Overseer.java   |  27 +--
 .../org/apache/solr/cloud/OverseerTaskQueue.java   |   2 +
 .../org/apache/solr/cloud/RecoveryStrategy.java    |  35 ++--
 .../java/org/apache/solr/cloud/ZkController.java   |  49 +----
 .../java/org/apache/solr/cloud/ZkShardTerms.java   |  42 ++--
 .../apache/solr/cloud/overseer/ZkStateWriter.java  | 218 ++++++++++++---------
 .../apache/solr/core/CachingDirectoryFactory.java  |   2 +-
 .../java/org/apache/solr/core/CoreContainer.java   |  97 ++++-----
 .../apache/solr/core/CorePropertiesLocator.java    |   4 +-
 .../src/java/org/apache/solr/core/SolrCore.java    |  24 +--
 .../src/java/org/apache/solr/core/SolrCores.java   |   2 +-
 .../java/org/apache/solr/handler/IndexFetcher.java |  15 +-
 .../solr/handler/admin/CoreAdminOperation.java     |   8 +-
 .../apache/solr/handler/admin/PrepRecoveryOp.java  |   4 +-
 .../handler/component/RealTimeGetComponent.java    |   6 +-
 .../apache/solr/schema/ZkIndexSchemaReader.java    |   2 +
 .../org/apache/solr/servlet/SolrQoSFilter.java     |  86 ++++----
 .../apache/solr/update/DefaultSolrCoreState.java   |   4 +-
 .../src/java/org/apache/solr/update/PeerSync.java  |  74 +++----
 .../org/apache/solr/update/PeerSyncWithLeader.java |   9 +-
 .../java/org/apache/solr/update/SolrCoreState.java |   4 +-
 .../org/apache/solr/update/SolrIndexSplitter.java  |   2 +-
 .../org/apache/solr/update/SolrIndexWriter.java    |   8 +-
 .../src/java/org/apache/solr/update/UpdateLog.java |  12 +-
 .../org/apache/solr/update/UpdateShardHandler.java |   2 +-
 .../processor/DistributedZkUpdateProcessor.java    |   8 +-
 .../solr/cloud/MissingSegmentRecoveryTest.java     |   2 +
 solr/server/etc/jetty-https.xml                    |   2 +-
 solr/server/etc/jetty-https8.xml                   |   5 +
 solr/server/etc/jetty.xml                          |   4 -
 solr/server/resources/log4j2.xml                   |   4 +-
 .../solr/client/solrj/impl/Http2SolrClient.java    |   8 +-
 .../src/java/org/apache/solr/common/ParWork.java   |  16 +-
 .../apache/solr/common/PerThreadExecService.java   |  20 +-
 .../apache/solr/common/cloud/SolrZooKeeper.java    |  36 ++--
 .../apache/solr/common/cloud/ZkStateReader.java    |  96 +++++----
 .../java/org/apache/solr/common/util/SysStats.java |  23 ++-
 .../org/apache/zookeeper/ZooKeeperExposed.java     |   2 +-
 41 files changed, 474 insertions(+), 498 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java b/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java
index ec7e5b9..fee7f51 100644
--- a/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java
+++ b/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java
@@ -212,7 +212,7 @@ public class NRTCachingDirectory extends FilterDirectory implements Accountable
     // it for defensive reasons... or in case the app is
     // doing something custom (creating outputs directly w/o
     // using IndexWriter):
-    if (Boolean.getBoolean("solr.nrtDirSync")) { // nocommit)
+  //  if (Boolean.getBoolean("solr.nrtDirSync")) { // nocommit)
       IOUtils.close(() -> {
         if (!closed.getAndSet(true)) {
           for (String fileName : cacheDirectory.listAll()) {
@@ -220,7 +220,7 @@ public class NRTCachingDirectory extends FilterDirectory implements Accountable
           }
         }
       }, cacheDirectory, in);
-    }
+  //  }
   }
 
   /** Subclass can override this to customize logic; return
diff --git a/solr/bin/solr b/solr/bin/solr
index 3216d78..49cfa83 100755
--- a/solr/bin/solr
+++ b/solr/bin/solr
@@ -1904,7 +1904,7 @@ if [ -z ${GC_LOG_OPTS+x} ]; then
                  '-XX:+PrintGCDateStamps' '-XX:+PrintGCTimeStamps' '-XX:+PrintTenuringDistribution' \
                  '-XX:+PrintGCApplicationStoppedTime')
   else
-    GC_LOG_OPTS=('-Xlog:gc*')
+    GC_LOG_OPTS=('')
   fi
 else
   GC_LOG_OPTS=($GC_LOG_OPTS)
diff --git a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
index 21b2a41..184356c 100644
--- a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
+++ b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
@@ -573,6 +573,8 @@ public class LeaderElector implements Closeable {
       SolrZooKeeper zk = zkClient.getSolrZooKeeper();
       try {
         zk.removeWatches(watchedNode, this, WatcherType.Any, true);
+      } catch (KeeperException.NoWatcherException e) {
+
       } catch (Exception e) {
         log.info("could not remove watch {} {}", e.getClass().getSimpleName(), e.getMessage());
       }
diff --git a/solr/core/src/java/org/apache/solr/cloud/Overseer.java b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
index 6dac590..754a970 100644
--- a/solr/core/src/java/org/apache/solr/cloud/Overseer.java
+++ b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
@@ -30,6 +30,7 @@ import org.apache.solr.cloud.overseer.OverseerAction;
 import org.apache.solr.cloud.overseer.ZkStateWriter;
 import org.apache.solr.common.AlreadyClosedException;
 import org.apache.solr.common.ParWork;
+import org.apache.solr.common.ParWorkExecutor;
 import org.apache.solr.common.SolrCloseable;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrThread;
@@ -45,6 +46,7 @@ import org.apache.solr.common.util.IOUtils;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.ObjectReleaseTracker;
 import org.apache.solr.common.util.Pair;
+import org.apache.solr.common.util.SysStats;
 import org.apache.solr.core.CloudConfig;
 import org.apache.solr.core.CoreContainer;
 import org.apache.solr.handler.admin.CollectionsHandler;
@@ -53,7 +55,6 @@ import org.apache.solr.update.UpdateShardHandler;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
-import org.apache.zookeeper.data.Stat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -71,7 +72,8 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ExecutorService;
-import java.util.concurrent.SynchronousQueue;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.locks.ReentrantLock;
 import java.util.function.BiConsumer;
 
@@ -158,7 +160,7 @@ public class Overseer implements SolrCloseable {
   private volatile boolean initedHttpClient = false;
   private volatile QueueWatcher queueWatcher;
   private volatile WorkQueueWatcher.CollectionWorkQueueWatcher collectionQueueWatcher;
-  private volatile ExecutorService taskExecutor;
+  private volatile ParWorkExecutor taskExecutor;
 
   public boolean isDone() {
     return closeAndDone;
@@ -285,14 +287,10 @@ public class Overseer implements SolrCloseable {
 //
 //     stateManagmentExecutor = ParWork.getParExecutorService("stateManagmentExecutor",
 //        1, 1, 3000, new SynchronousQueue());
-     taskExecutor = ParWork.getParExecutorService("overseerTaskExecutor",
-        3, 32, 1000, new SynchronousQueue());
+     taskExecutor = (ParWorkExecutor) ParWork.getParExecutorService("overseerTaskExecutor",
+         SysStats.PROC_COUNT, SysStats.PROC_COUNT * 2, 3000, new LinkedBlockingQueue<>(1024));
+     taskExecutor.prestartAllCoreThreads();
 
-//    try {
-//      if (context != null) context.close();
-//    } catch (Exception e) {
-//      log.error("", e);
-//    }
     if (overseerOnlyClient == null && !closeAndDone && !initedHttpClient) {
       overseerOnlyClient = new Http2SolrClient.Builder().idleTimeout(60000).connectionTimeout(5000).markInternalRequest().build();
       overseerOnlyClient.enableCloseLock();
@@ -336,7 +334,7 @@ public class Overseer implements SolrCloseable {
     ThreadGroup ccTg = new ThreadGroup("Overseer collection creation process.");
 
 
-    this.zkStateWriter = new ZkStateWriter(zkController.getZkStateReader(), stats);
+    this.zkStateWriter = new ZkStateWriter(zkController.getZkStateReader(), stats, this);
     //systemCollectionCompatCheck(new StringBiConsumer());
 
     queueWatcher = new WorkQueueWatcher(getCoreContainer());
@@ -538,7 +536,10 @@ public class Overseer implements SolrCloseable {
       }
 
       if (taskExecutor != null) {
-        taskExecutor.shutdownNow();
+        try {
+          taskExecutor.awaitTermination(5000, TimeUnit.SECONDS);
+        } catch (InterruptedException e) {
+        }
       }
 
     }
@@ -813,6 +814,8 @@ public class Overseer implements SolrCloseable {
         this.closed = true;
         try {
           zkController.getZkClient().getSolrZooKeeper().removeWatches(path, this, WatcherType.Data, true);
+        } catch (KeeperException.NoWatcherException e) {
+
         } catch (Exception e) {
           log.info("could not remove watch {} {}", e.getClass().getSimpleName(), e.getMessage());
         }
diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskQueue.java b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskQueue.java
index 778336a..5eca2f4 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskQueue.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskQueue.java
@@ -226,6 +226,8 @@ public class OverseerTaskQueue extends ZkDistributedQueue {
       this.closed = true;
       try {
         zkClient.getSolrZooKeeper().removeWatches(path, this, WatcherType.Data, true);
+      }  catch (KeeperException.NoWatcherException e) {
+
       } catch (Exception e) {
         log.info("could not remove watch {} {}", e.getClass().getSimpleName(), e.getMessage());
       }
diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index ee085b2..cce536d 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -224,15 +224,14 @@ public class RecoveryStrategy implements Runnable, Closeable {
     return leaderprops.getCoreUrl();
   }
 
-  final private IndexFetcher.IndexFetchResult replicate(Replica leaderprops)
+  final private IndexFetcher.IndexFetchResult replicate(Replica leader)
       throws SolrServerException, IOException {
 
-    log.info("Attempting to replicate from [{}].", leaderprops);
+    log.info("Attempting to replicate from [{}].", leader);
 
     String leaderUrl;
     // send commit
     try {
-      Replica leader = zkController.getZkStateReader().getLeaderRetry(coreDescriptor.getCollectionName(), coreDescriptor.getCloudDescriptor().getShardId(), 1500, false);
       leaderUrl = leader.getCoreUrl();
       commitOnLeader(leaderUrl);
     } catch (Exception e) {
@@ -350,7 +349,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
           // expected
         }
 
-        Replica leader = zkController.getZkStateReader().getLeaderRetry(coreDescriptor.getCollectionName(), coreDescriptor.getCloudDescriptor().getShardId(), 1500, false);
+        Replica leader = zkController.getZkStateReader().getLeaderRetry(coreDescriptor.getCollectionName(), coreDescriptor.getCloudDescriptor().getShardId(), 3000, false);
 
         if (leader != null && leader.getName().equals(coreName)) {
           log.info("We are the leader, STOP recovery");
@@ -365,10 +364,10 @@ public class RecoveryStrategy implements Runnable, Closeable {
         boolean successfulRecovery;
         if (this.coreDescriptor.getCloudDescriptor().requiresTransactionLog()) {
           if (log.isDebugEnabled()) log.debug("Sync or replica recovery");
-          successfulRecovery = doSyncOrReplicateRecovery(core);
+          successfulRecovery = doSyncOrReplicateRecovery(core, leader);
         } else {
           if (log.isDebugEnabled()) log.debug("Replicate only recovery");
-          successfulRecovery = doReplicateOnlyRecovery(core);
+          successfulRecovery = doReplicateOnlyRecovery(core, leader);
         }
 
         if (successfulRecovery) {
@@ -384,7 +383,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
     }
   }
 
-  final private boolean doReplicateOnlyRecovery(SolrCore core) throws Exception {
+  final private boolean doReplicateOnlyRecovery(SolrCore core, Replica leader) throws Exception {
     boolean successfulRecovery = false;
 
     // if (core.getUpdateHandler().getUpdateLog() != null) {
@@ -397,15 +396,18 @@ public class RecoveryStrategy implements Runnable, Closeable {
     log.info("Publishing state of core [{}] as recovering {}", coreName, "doReplicateOnlyRecovery");
 
     zkController.publish(this.coreDescriptor, Replica.State.RECOVERING);
-
+    int cnt = 0;
     while (!successfulRecovery && !isClosed()) { // don't use interruption or
       // it will close channels
       // though
+      cnt++;
       try {
         CloudDescriptor cloudDesc = this.coreDescriptor.getCloudDescriptor();
-        Replica leader;
+
         try {
-          leader = zkStateReader.getLeaderRetry(cloudDesc.getCollectionName(), cloudDesc.getShardId(), 1500, false);
+          if (cnt > 1) {
+            leader = zkStateReader.getLeaderRetry(cloudDesc.getCollectionName(), cloudDesc.getShardId(), 3000, false);
+          }
 
           if (leader != null && leader.getName().equals(coreName)) {
             log.info("We are the leader, STOP recovery");
@@ -503,16 +505,9 @@ public class RecoveryStrategy implements Runnable, Closeable {
   }
 
   // TODO: perhaps make this grab a new core each time through the loop to handle core reloads?
-  public final boolean doSyncOrReplicateRecovery(SolrCore core) throws Exception {
+  public final boolean doSyncOrReplicateRecovery(SolrCore core, Replica leader) throws Exception {
     log.info("Do peersync or replication recovery core={} collection={}", coreName, coreDescriptor.getCollectionName());
 
-    Replica leader = zkController.getZkStateReader().getLeaderRetry(coreDescriptor.getCollectionName(), coreDescriptor.getCloudDescriptor().getShardId(), 1500);
-    if (leader != null && leader.getName().equals(coreName)) {
-      log.info("We are the leader, STOP recovery");
-      close = true;
-      throw new AlreadyClosedException();
-    }
-
     log.info("Publishing state of core [{}] as recovering {}", coreName, "doSyncOrReplicateRecovery");
 
     zkController.publish(this.coreDescriptor, Replica.State.RECOVERING);
@@ -602,8 +597,10 @@ public class RecoveryStrategy implements Runnable, Closeable {
     while (!successfulRecovery && !isClosed()) {
       try {
         CloudDescriptor cloudDesc = this.coreDescriptor.getCloudDescriptor();
-        leader = zkStateReader.getLeaderRetry(cloudDesc.getCollectionName(), cloudDesc.getShardId(), 1500);
 
+        if (!firstTime) {
+          leader = zkStateReader.getLeaderRetry(cloudDesc.getCollectionName(), cloudDesc.getShardId(), 3000, false);
+        }
         if (leader != null && leader.getName().equals(coreName)) {
           log.info("We are the leader, STOP recovery");
           close = true;
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index b2abf8f..e5dd8d3 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -170,7 +170,7 @@ public class ZkController implements Closeable, Runnable {
 
   @Override
   public void run() {
-    disconnect(false);
+    disconnect(true);
   }
 
   public boolean isDcCalled() {
@@ -628,7 +628,7 @@ public class ZkController implements Closeable, Runnable {
   }
 
   public void disconnect(boolean publishDown) {
-    if (log.isDebugEnabled()) log.debug("disconnect");
+    log.info("disconnect");
     this.dcCalled = true;
 
     try {
@@ -650,7 +650,6 @@ public class ZkController implements Closeable, Runnable {
           }
           return "PublishDown";
         });
-        closer.collect();
       }
 
       closer.collect(leaderElectors);
@@ -1432,7 +1431,7 @@ public class ZkController implements Closeable, Runnable {
 //            throw new AlreadyClosedException();
 //          }
 
-          leader = zkStateReader.getLeaderRetry(collection, shardId, 1500, false);
+          leader = zkStateReader.getLeaderRetry(collection, shardId, 3000, false);
 
         } catch (TimeoutException timeoutException) {
            log.info("Timeout waiting to see leader, retry");
@@ -1569,44 +1568,6 @@ public class ZkController implements Closeable, Runnable {
     }
   }
 
-  // timeoutms is the timeout for the first call to get the leader - there is then
-  // a longer wait to make sure that leader matches our local state
-  private String getLeader(final CloudDescriptor cloudDesc, int timeoutms) {
-
-    String collection = cloudDesc.getCollectionName();
-    String shardId = cloudDesc.getShardId();
-    // rather than look in the cluster state file, we go straight to the zknodes
-    // here, because on cluster restart there could be stale leader info in the
-    // cluster state node that won't be updated for a moment
-    String leaderUrl;
-    try {
-      leaderUrl = getLeaderProps(collection, cloudDesc.getShardId(), timeoutms)
-              .getCoreUrl();
-
-      zkStateReader.waitForState(collection, timeoutms * 2, TimeUnit.MILLISECONDS, (n, c) -> checkLeaderUrl(cloudDesc, leaderUrl, collection, shardId, leaderConflictResolveWait));
-
-    } catch (Exception e) {
-      ParWork.propagateInterrupt(e);
-      throw new SolrException(ErrorCode.SERVER_ERROR, "Error getting leader from zk", e);
-    }
-    return leaderUrl;
-  }
-
-  private boolean checkLeaderUrl(CloudDescriptor cloudDesc, String leaderUrl, String collection, String shardId,
-                                 int timeoutms) {
-    // now wait until our currently cloud state contains the latest leader
-    String clusterStateLeaderUrl;
-    try {
-      clusterStateLeaderUrl = zkStateReader.getLeaderUrl(collection, shardId, 10000);
-
-      // leaderUrl = getLeaderProps(collection, cloudDesc.getShardId(), timeoutms).getCoreUrl();
-    } catch (Exception e) {
-      ParWork.propagateInterrupt(e);
-      throw new SolrException(ErrorCode.SERVER_ERROR, e);
-    }
-    return clusterStateLeaderUrl != null;
-  }
-
   /**
    * Get leader props directly from zk nodes.
    * @throws SessionExpiredException on zk session expiration.
@@ -1656,6 +1617,7 @@ public class ZkController implements Closeable, Runnable {
     Map<String, Object> props = new HashMap<>();
     // we only put a subset of props into the leader node
     props.put(ZkStateReader.NODE_NAME_PROP, getNodeName());
+    props.put(CORE_NAME_PROP, cd.getName());
 
     Replica replica = new Replica(cd.getName(), props, collection, shardId, zkStateReader);
     LeaderElector leaderElector;
@@ -1751,9 +1713,6 @@ public class ZkController implements Closeable, Runnable {
       props.put(ZkStateReader.COLLECTION_PROP, collection);
       props.put(ZkStateReader.REPLICA_TYPE, cd.getCloudDescriptor().getReplicaType().toString());
 
-      if (numShards != null) {
-        props.put(ZkStateReader.NUM_SHARDS_PROP, numShards.toString());
-      }
       try (SolrCore core = cc.getCore(cd.getName())) {
         if (core != null && core.getDirectoryFactory().isSharedStorage()) {
           // nocommit
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkShardTerms.java b/solr/core/src/java/org/apache/solr/cloud/ZkShardTerms.java
index d9eb9a3..2adfe5e 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkShardTerms.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkShardTerms.java
@@ -24,7 +24,6 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicReference;
 
@@ -99,7 +98,7 @@ public class ZkShardTerms implements Closeable {
     void close();
   }
 
-  public ZkShardTerms(String collection, String shard, SolrZkClient zkClient) throws IOException{
+  public ZkShardTerms(String collection, String shard, SolrZkClient zkClient) throws IOException, KeeperException {
     this.znodePath = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/terms/" + shard;
     this.collection = collection;
     this.shard = shard;
@@ -223,7 +222,7 @@ public class ZkShardTerms implements Closeable {
    */
   void registerTerm(String coreNodeName) throws KeeperException, InterruptedException {
     ShardTerms newTerms;
-    while ( (newTerms = terms.get().registerTerm(coreNodeName)) != null) {
+    while ((newTerms = terms.get().registerTerm(coreNodeName)) != null) {
       if (forceSaveTerms(newTerms)) break;
     }
   }
@@ -324,6 +323,7 @@ public class ZkShardTerms implements Closeable {
    */
   private boolean saveTerms(ShardTerms newTerms) throws KeeperException, InterruptedException {
     byte[] znodeData = Utils.toJSON(newTerms);
+
     try {
       Stat stat = zkClient.setData(znodePath, znodeData, newTerms.getVersion(), true);
       ShardTerms newShardTerms = new ShardTerms(newTerms, stat.getVersion());
@@ -331,7 +331,12 @@ public class ZkShardTerms implements Closeable {
       if (log.isDebugEnabled()) log.debug("Successful update of terms at {} to {}", znodePath, newTerms);
       return true;
     } catch (KeeperException.BadVersionException e) {
-      log.info("Failed to save terms, version is not a match, retrying version={}", newTerms.getVersion());
+      int foundVersion = -1;
+      Stat stat = zkClient.exists(znodePath, null);
+      if (stat != null) {
+        foundVersion = stat.getVersion();
+      }
+      log.info("Failed to save terms, version is not a match, retrying version={} found={}", newTerms.getVersion(), foundVersion);
 
       if (isClosed.get()) {
         throw new AlreadyClosedException();
@@ -353,8 +358,11 @@ public class ZkShardTerms implements Closeable {
         if (Watcher.Event.EventType.None == event.getType()) {
           return;
         }
-        if (event.getType() == Watcher.Event.EventType.NodeCreated || event.getType() == Watcher.Event.EventType.NodeDataChanged) {
+
+        try {
           retryRegisterWatcher();
+        } catch (KeeperException e) {
+          log.warn("Exception refreshing terms on watcher event", e);
         }
       };
       Stat stat = new Stat();
@@ -365,7 +373,7 @@ public class ZkShardTerms implements Closeable {
     } catch (KeeperException.NoNodeException e) {
       log.warn("No node found for shard terms", e);
       // we have likely been deleted
-      return;
+      throw new AlreadyClosedException("No node found for shard terms");
     } catch (InterruptedException e) {
       ParWork.propagateInterrupt(e);
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error updating shard term for collection: " + collection, e);
@@ -377,26 +385,8 @@ public class ZkShardTerms implements Closeable {
   /**
    * Retry register a watcher to the correspond ZK term node
    */
-  private void retryRegisterWatcher() {
-    while (!isClosed.get()) {
-      try {
-        refreshTerms(true);
-        return;
-      } catch (KeeperException.AuthFailedException e) {
-        isClosed.set(true);
-        log.error("Failed watching shard term for collection: {} due to unrecoverable exception", collection, e);
-        return;
-      } catch (KeeperException e) {
-        log.warn("Failed watching shard term for collection: {}, retrying!", collection, e);
-        try {
-          zkClient.getConnectionManager().waitForConnected(zkClient.getZkClientTimeout());
-        } catch (TimeoutException | InterruptedException te) {
-          if (Thread.interrupted()) {
-            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error watching shard term for collection: " + collection, te);
-          }
-        }
-      }
-    }
+  private void retryRegisterWatcher() throws KeeperException {
+    refreshTerms(true);
   }
 
   /**
diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java
index 80013b4..f5acb9a 100644
--- a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java
+++ b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java
@@ -56,6 +56,7 @@ public class ZkStateWriter {
 
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
   private final ZkStateReader reader;
+  private final Overseer overseer;
 
   /**
    * Represents a no-op {@link ZkWriteCommand} which will result in no modification to cluster state
@@ -85,9 +86,9 @@ public class ZkStateWriter {
   private Set<String> dirtyStructure = new HashSet<>();
   private Set<String> dirtyState = new HashSet<>();
 
-  public ZkStateWriter(ZkStateReader zkStateReader, Stats stats) {
+  public ZkStateWriter(ZkStateReader zkStateReader, Stats stats, Overseer overseer) {
     assert zkStateReader != null;
-
+    this.overseer = overseer;
     this.reader = zkStateReader;
     this.stats = stats;
 
@@ -225,7 +226,7 @@ public class ZkStateWriter {
                   List<Replica> replicas = docColl.getReplicas();
                   for (Replica replica : replicas) {
                     if (replica.getState() != Replica.State.DOWN && replica.getNodeName().equals(entry.getValue())) {
-                      log.info("set downnode for replica {}", replica);
+                      if (log.isDebugEnabled()) log.debug("set downnode for replica {}", replica);
                       // nocommit
                       Slice slice = docColl.getSlice(replica.getSlice());
                       slice.setLeader(null);
@@ -297,6 +298,7 @@ public class ZkStateWriter {
                         docColl.getSlice(replica).setLeader(null);
                       }
                       updates.getProperties().put(replica.getName(), Replica.State.getShortState(state));
+                      updates.getProperties().remove("leader");
                       // log.info("set state {} {}", state, replica);
                       replica.setState(state);
                       dirtyState.add(collection);
@@ -352,130 +354,154 @@ public class ZkStateWriter {
   // if additional updates too large, publish structure changew
   public void writePendingUpdates() {
 
-   // writeLock.lock();
-   // try {
-   //   log.info("Get our write lock");
-      ourLock.lock();
+    do {
       try {
-   //     log.info("Got our write lock");
+        write();
+      } catch (KeeperException.BadVersionException e) {
+        continue;
+      }
+      break;
+    } while (!overseer.isClosed());
 
-        throttle.minimumWaitBetweenActions();
-        throttle.markAttemptingAction();
+  }
 
-        if (log.isTraceEnabled()) {
-          log.trace("writePendingUpdates {}", cs);
-        }
+  private void write() throws KeeperException.BadVersionException {
+    // writeLock.lock();
+    // try {
+    //   log.info("Get our write lock");
+    ourLock.lock();
+    try {
+ //     log.info("Got our write lock");
 
-        if (failedUpdates.size() > 0) {
-          log.warn("Some collection updates failed {} logging last exception", failedUpdates, lastFailedException); // nocommit expand
-          failedUpdates.clear();
-          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, lastFailedException.get());
-        }
+      throttle.minimumWaitBetweenActions();
+      throttle.markAttemptingAction();
+
+      if (log.isTraceEnabled()) {
+        log.trace("writePendingUpdates {}", cs);
+      }
+
+      if (failedUpdates.size() > 0) {
+        log.warn("Some collection updates failed {} logging last exception", failedUpdates, lastFailedException); // nocommit expand
+        failedUpdates.clear();
+        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, lastFailedException.get());
+      }
 //      } finally {
 //        ourLock.unlock();
 //      }
 
-      // wait to see our last publish version has propagated TODO don't wait on collections not hosted on overseer?
-      // waitForStateWePublishedToComeBack();
-
-   //   ourLock.lock();
-      AtomicInteger lastVersion = new AtomicInteger();
-      //log.info("writing out state, looking at collections count={} toWrite={} {} : {}", cs.getCollectionsMap().size(), collectionsToWrite.size(), cs.getCollectionsMap().keySet(), collectionsToWrite);
-      //try {
-        cs.forEachCollection(collection -> {
-         // log.info("check collection {}", collection);
-          if (dirtyStructure.contains(collection.getName()) || dirtyState.contains(collection.getName())) {
-          //  log.info("process collection {}", collection);
-            String name = collection.getName();
-            String path = ZkStateReader.getCollectionPath(collection.getName());
-            String pathSCN = ZkStateReader.getCollectionSCNPath(collection.getName());
-           // log.info("process collection {} path {}", collection.getName(), path);
-            Stat existsStat = null;
-            if (log.isTraceEnabled()) log.trace("process {}", collection);
+    // wait to see our last publish version has propagated TODO don't wait on collections not hosted on overseer?
+    // waitForStateWePublishedToComeBack();
+
+ //   ourLock.lock();
+    AtomicInteger lastVersion = new AtomicInteger();
+    AtomicReference<KeeperException.BadVersionException> badVersionException = new AtomicReference<>();
+    //log.info("writing out state, looking at collections count={} toWrite={} {} : {}", cs.getCollectionsMap().size(), collectionsToWrite.size(), cs.getCollectionsMap().keySet(), collectionsToWrite);
+    //try {
+      cs.forEachCollection(collection -> {
+       // log.info("check collection {}", collection);
+        Integer version = null;
+        if (dirtyStructure.contains(collection.getName()) || dirtyState.contains(collection.getName())) {
+        //  log.info("process collection {}", collection);
+          String name = collection.getName();
+          String path = ZkStateReader.getCollectionPath(collection.getName());
+          String pathSCN = ZkStateReader.getCollectionSCNPath(collection.getName());
+         // log.info("process collection {} path {}", collection.getName(), path);
+          Stat existsStat = null;
+          if (log.isTraceEnabled()) log.trace("process {}", collection);
+          try {
+           // log.info("get data for {}", name);
+            byte[] data = Utils.toJSON(singletonMap(name, collection));
+          //  log.info("got data for {} {}", name, data.length);
+
             try {
-             // log.info("get data for {}", name);
-              byte[] data = Utils.toJSON(singletonMap(name, collection));
-            //  log.info("got data for {} {}", name, data.length);
+              Integer v = trackVersions.get(collection.getName());
 
-              try {
-                Integer version = null;
-                Integer v = trackVersions.get(collection.getName());
+              if (v != null) {
+                //log.info("got version from cache {}", v);
+                version = v;
+              } else {
+                version = 0;
+              }
+              lastVersion.set(version);
+              if (log.isDebugEnabled()) log.debug("Write state.json prevVersion={} bytes={} col={}", version, data.length, collection);
 
-                if (v != null) {
-                  //log.info("got version from cache {}", v);
-                  version = v;
-                } else {
-                  version = 0;
-                }
-                lastVersion.set(version);
-                if (log.isDebugEnabled()) log.debug("Write state.json prevVersion={} bytes={} col={}", version, data.length, collection);
-
-                reader.getZkClient().setData(path, data, version, true);
-                trackVersions.put(collection.getName(), version + 1);
-                if (dirtyStructure.contains(collection.getName())) {
-                  if (log.isDebugEnabled()) log.debug("structure change in {}", collection.getName());
-                  dirtyStructure.remove(collection.getName());
-                  reader.getZkClient().setData(pathSCN, null, -1, true);
-
-                  ZkNodeProps updates = stateUpdates.get(collection.getName());
-                  if (updates != null) {
-                    updates.getProperties().clear();
-                  }
-                }
+              reader.getZkClient().setData(path, data, version, true);
+              trackVersions.put(collection.getName(), version + 1);
+              if (dirtyStructure.contains(collection.getName())) {
+                if (log.isDebugEnabled()) log.debug("structure change in {}", collection.getName());
+                dirtyStructure.remove(collection.getName());
+                reader.getZkClient().setData(pathSCN, null, -1, true);
 
-              } catch (KeeperException.NoNodeException e) {
-                if (log.isDebugEnabled()) log.debug("No node found for state.json", e);
+                ZkNodeProps updates = stateUpdates.get(collection.getName());
+                if (updates != null) {
+                  updates.getProperties().clear();
+                }
+              }
 
-                lastVersion.set(-1);
-              //  trackVersions.remove(collection.getName());
-                // likely deleted
-                return;
+            } catch (KeeperException.NoNodeException e) {
+              if (log.isDebugEnabled()) log.debug("No node found for state.json", e);
 
-              } catch (KeeperException.BadVersionException bve) {
-                //lastFailedException.set(bve);
-                //failedUpdates.put(collection.getName(), collection);
-               // Stat estate = reader.getZkClient().exists(path, null);
-                trackVersions.remove(collection.getName());
-                throw bve;
+              lastVersion.set(-1);
+            //  trackVersions.remove(collection.getName());
+              // likely deleted
+              return;
 
-              }
+            } catch (KeeperException.BadVersionException bve) {
+              //lastFailedException.set(bve);
+              //failedUpdates.put(collection.getName(), collection);
+             // Stat estate = reader.getZkClient().exists(path, null);
+              trackVersions.remove(collection.getName());
+              Stat stat = reader.getZkClient().exists(path, null);
+              log.error("Tried to update state.json with bad version {} \n {}", version, stat != null ? stat.getVersion() : "null");
 
-              if (dirtyState.contains(collection.getName())) {
-                ZkNodeProps updates = stateUpdates.get(collection.getName());
-                if (updates != null) {
-                  String stateUpdatesPath = ZkStateReader.getCollectionStateUpdatesPath(collection.getName());
-                  if (log.isDebugEnabled()) log.debug("write state updates for collection {} {}", collection.getName(), updates);
-                  dirtyState.remove(collection.getName());
-                  reader.getZkClient().setData(stateUpdatesPath, Utils.toJSON(updates), -1, true);
-                }
+              if (!overseer.isClosed() && stat != null) {
+                trackVersions.put(collection.getName(), stat.getVersion());
               }
 
-            } catch (InterruptedException | AlreadyClosedException e) {
-              log.info("We have been closed or one of our resources has, bailing {}", e.getClass().getSimpleName() + ":" + e.getMessage());
+              throw bve;
 
-            } catch (Exception e) {
-              log.error("Failed processing update=" + collection, e);
             }
-          }
 
-        });
+            if (dirtyState.contains(collection.getName())) {
+              ZkNodeProps updates = stateUpdates.get(collection.getName());
+              if (updates != null) {
+                String stateUpdatesPath = ZkStateReader.getCollectionStateUpdatesPath(collection.getName());
+                if (log.isDebugEnabled()) log.debug("write state updates for collection {} {}", collection.getName(), updates);
+                dirtyState.remove(collection.getName());
+                reader.getZkClient().setData(stateUpdatesPath, Utils.toJSON(updates), -1, true);
+              }
+            }
 
+          } catch (KeeperException.BadVersionException bve) {
+            badVersionException.set(bve);
+            return;
+          } catch (InterruptedException | AlreadyClosedException e) {
+            log.info("We have been closed or one of our resources has, bailing {}", e.getClass().getSimpleName() + ":" + e.getMessage());
 
+          } catch (Exception e) {
+            log.error("Failed processing update=" + collection, e);
+          }
+        }
 
-        //log.info("Done with successful cluster write out");
+      });
 
-      } finally {
-        ourLock.unlock();
+      if (badVersionException.get() != null) {
+        throw badVersionException.get();
       }
-//    } finally {
-//      writeLock.unlock();
-//    }
+
+      //log.info("Done with successful cluster write out");
+
+    } finally {
+      ourLock.unlock();
+    }
+    //    } finally {
+    //      writeLock.unlock();
+    //    }
     // nocommit - harden against failures and exceptions
 
     //    if (log.isDebugEnabled()) {
     //      log.debug("writePendingUpdates() - end - New Cluster State is: {}", newClusterState);
     //    }
-
   }
 
   private void waitForStateWePublishedToComeBack() {
diff --git a/solr/core/src/java/org/apache/solr/core/CachingDirectoryFactory.java b/solr/core/src/java/org/apache/solr/core/CachingDirectoryFactory.java
index 28c8028..430ba55 100644
--- a/solr/core/src/java/org/apache/solr/core/CachingDirectoryFactory.java
+++ b/solr/core/src/java/org/apache/solr/core/CachingDirectoryFactory.java
@@ -540,7 +540,7 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
       //    }
       cacheValue.refCnt--;
 
-      if (cacheValue.refCnt == 0 && cacheValue.doneWithDir ||  closed) {
+      if (cacheValue.refCnt == 0 && cacheValue.doneWithDir || closed) {
         boolean cl = closeCacheValue(cacheValue);
         if (cl) {
           removeFromCache(cacheValue);
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index 26f6a17..f4e223b 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -25,7 +25,6 @@ import org.apache.http.config.Lookup;
 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.store.Directory;
 import org.apache.solr.client.solrj.SolrClient;
 import org.apache.solr.client.solrj.cloud.SolrCloudManager;
 import org.apache.solr.client.solrj.impl.CloudHttp2SolrClient;
@@ -40,7 +39,6 @@ import org.apache.solr.cloud.ZkController;
 import org.apache.solr.cloud.overseer.OverseerAction;
 import org.apache.solr.common.AlreadyClosedException;
 import org.apache.solr.common.ParWork;
-import org.apache.solr.common.PerThreadExecService;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.cloud.DocCollection;
@@ -55,13 +53,12 @@ import org.apache.solr.common.util.IOUtils;
 import org.apache.solr.common.util.ObjectCache;
 import org.apache.solr.common.util.ObjectReleaseTracker;
 import org.apache.solr.common.util.OrderedExecutor;
+import org.apache.solr.common.util.SysStats;
 import org.apache.solr.common.util.Utils;
-import org.apache.solr.core.DirectoryFactory.DirContext;
 import org.apache.solr.core.backup.repository.BackupRepository;
 import org.apache.solr.core.backup.repository.BackupRepositoryFactory;
 import org.apache.solr.filestore.PackageStoreAPI;
 import org.apache.solr.handler.RequestHandlerBase;
-import org.apache.solr.handler.SnapShooter;
 import org.apache.solr.handler.admin.CollectionsHandler;
 import org.apache.solr.handler.admin.ConfigSetsHandler;
 import org.apache.solr.handler.admin.CoreAdminHandler;
@@ -125,12 +122,10 @@ import java.nio.file.NoSuchFileException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.security.spec.InvalidKeySpecException;
-import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Comparator;
-import java.util.Date;
 import java.util.HashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
@@ -193,9 +188,7 @@ public class CoreContainer implements Closeable {
 
   private volatile UpdateShardHandler updateShardHandler;
 
-  public volatile ExecutorService solrCoreLoadExecutor;
-
-  public volatile ExecutorService solrCoreCloseExecutor;
+  public volatile ExecutorService solrCoreExecutor;
 
   private final OrderedExecutor replayUpdatesExecutor;
 
@@ -404,11 +397,8 @@ public class CoreContainer implements Closeable {
 
     containerProperties.putAll(cfg.getSolrProperties());
 
-    solrCoreLoadExecutor = new PerThreadExecService(ParWork.getRootSharedExecutor(), Math.max(16, Runtime.getRuntime().availableProcessors()),
-        false, false);
-
-    solrCoreCloseExecutor = new PerThreadExecService(ParWork.getRootSharedExecutor(), Math.max(16, Runtime.getRuntime().availableProcessors()),
-        false, false);
+    solrCoreExecutor = ParWork.getParExecutorService("SolrCoreExecutor",
+        1, SysStats.PROC_COUNT * 2, 3000, new LinkedBlockingQueue<>(1024));
   }
 
   @SuppressWarnings({"unchecked"})
@@ -903,7 +893,7 @@ public class CoreContainer implements Closeable {
       }
       if (cd.isLoadOnStartup()) {
 
-        coreLoadFutures.add(solrCoreLoadExecutor.submit(() -> {
+        coreLoadFutures.add(solrCoreExecutor.submit(() -> {
           SolrCore core = null;
           MDCLoggingContext.setCoreDescriptor(this, cd);
           try {
@@ -1075,10 +1065,6 @@ public class CoreContainer implements Closeable {
         replayUpdatesExecutor.shutdownAndAwaitTermination();
       });
 
-      if (solrCoreLoadExecutor != null) {
-        solrCoreLoadExecutor.shutdown();
-      }
-
       List<Callable<?>> callables = new ArrayList<>();
 
       if (metricManager != null) {
@@ -1146,20 +1132,15 @@ public class CoreContainer implements Closeable {
       closer.collect(callables);
       closer.collect(metricsHistoryHandler);
 
-
-      closer.collect(solrCoreLoadExecutor);
-
-
       closer.collect("WaitForSolrCores", solrCores);
 
-
       closer.addCollect();
 
       closer.collect(shardHandlerFactory);
       closer.collect(updateShardHandler);
 
 
-      closer.collect(solrCoreCloseExecutor);
+      closer.collect(solrCoreExecutor);
       closer.collect(solrClientCache);
 
       closer.collect(loader);
@@ -1415,7 +1396,7 @@ public class CoreContainer implements Closeable {
               zkSys.getZkController().throwErrorIfReplicaReplaced(dcore);
             }
           }
-          new ZkController.RegisterCoreAsync(zkSys.zkController, dcore, false).call();
+          ParWork.getRootSharedExecutor().submit(new ZkController.RegisterCoreAsync(zkSys.zkController, dcore, false));
         }
 
       } catch (Exception e) {
@@ -1459,11 +1440,11 @@ public class CoreContainer implements Closeable {
             if (core != null) {
 
               SolrCore finalCore1 = core;
-              solrCoreCloseExecutor.submit(() -> {
+              solrCoreExecutor.submit(() -> {
                 finalCore1.closeAndWait();
               });
               SolrCore finalOld = old;
-              solrCoreCloseExecutor.submit(() -> {
+              solrCoreExecutor.submit(() -> {
                 if (finalOld != null) {
                   finalOld.closeAndWait();
                 }
@@ -1511,6 +1492,10 @@ public class CoreContainer implements Closeable {
   private SolrCore processCoreCreateException(Exception original, CoreDescriptor dcore, ConfigSet coreConfig) {
     log.error("Error creating SolrCore", original);
 
+    if (isShutDown) {
+      return null;
+    }
+
     // Traverse full chain since CIE may not be root exception
     Throwable cause = original;
     if (!(cause instanceof  CorruptIndexException)) {
@@ -1544,10 +1529,34 @@
                .getLeader();
            if (leader != null && leader.getState() == State.ACTIVE) {
              log.info("Found active leader, will attempt to create fresh core and recover.");
-              resetIndexDirectory(dcore, coreConfig);
+
+              SolrConfig config = coreConfig.getSolrConfig();
+
+              String registryName = SolrMetricManager.getRegistryName(SolrInfoBean.Group.core, dcore.getName());
+              DirectoryFactory df = DirectoryFactory.loadDirectoryFactory(config, this, registryName);
+              String dataDir = SolrCore.findDataDir(df, null, config, dcore);
+              df.close();
+
+              try {
+                while (new File(dataDir).exists()) {
+                  try {
+                    try (java.util.stream.Stream<Path> walk = Files.walk(new File(dataDir).toPath())) {
+                      walk.sorted(Comparator.reverseOrder()).map(Path::toFile).forEach(File::delete);
+                    }
+                  } catch (NoSuchFileException e) {
+
+                  }
+                }
+              } catch (Exception e) {
+                SolrException.log(log, "Failed to delete data dir for core:" + dcore.getName() + " dir:" + dataDir);
+              }
+
+              SolrCore core = new SolrCore(this, dcore, coreConfig);
+              core.getUpdateHandler().getUpdateLog().deleteAll();
+
               // the index of this core is emptied, its term should be set to 0
               getZkController().getShardTerms(desc.getCollectionName(), desc.getShardId()).setTermToZero(dcore.getName());
-              return new SolrCore(this, dcore, coreConfig);
+              return core;
             }
           } catch (Exception se) {
             se.addSuppressed(original);
@@ -1581,35 +1587,6 @@ public class CoreContainer implements Closeable {
   }
 
   /**
-   * Write a new index directory for the a SolrCore, but do so without loading it.
-   */
-  private void resetIndexDirectory(CoreDescriptor dcore, ConfigSet coreConfig) {
-    SolrConfig config = coreConfig.getSolrConfig();
-
-    String registryName = SolrMetricManager.getRegistryName(SolrInfoBean.Group.core, dcore.getName());
-    DirectoryFactory df = DirectoryFactory.loadDirectoryFactory(config, this, registryName);
-    String dataDir = SolrCore.findDataDir(df, null, config, dcore);
-
-    String tmpIdxDirName = "index." + new SimpleDateFormat(SnapShooter.DATE_FMT, Locale.ROOT).format(new Date());
-    SolrCore.modifyIndexProps(df, dataDir, config, tmpIdxDirName);
-
-    // Free the directory object that we had to create for this
-    Directory dir = null;
-    try {
-      dir = df.get(dataDir, DirContext.META_DATA, config.indexConfig.lockType);
-    } catch (IOException e) {
-      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
-    } finally {
-      try {
-        df.doneWithDirectory(dir);
-        df.release(dir);
-      } catch (IOException e) {
-        SolrException.log(log, e);
-      }
-    }
-  }
-
-  /**
    * @return a Collection of registered SolrCores
    */
   public Collection<SolrCore> getCores() {
@@ -1821,7 +1798,7 @@ public class CoreContainer implements Closeable {
           if (!success) {
             log.error("Failed reloading core, cleaning up new core");
             SolrCore finalNewCore = newCore;
-            solrCoreCloseExecutor.submit(() -> {
+            solrCoreExecutor.submit(() -> {
               //            try {
               if (finalNewCore != null) {
                 log.error("Closing failed new core");
diff --git a/solr/core/src/java/org/apache/solr/core/CorePropertiesLocator.java b/solr/core/src/java/org/apache/solr/core/CorePropertiesLocator.java
index 4e4e2f8..3f9d586 100644
--- a/solr/core/src/java/org/apache/solr/core/CorePropertiesLocator.java
+++ b/solr/core/src/java/org/apache/solr/core/CorePropertiesLocator.java
@@ -171,8 +171,8 @@ public class CorePropertiesLocator implements CoresLocator {
       log.info("Found {} core definitions underneath {}", cds.size(), rootDirectory);
     }
     if (cds.size() > 0) {
-      if (log.isInfoEnabled()) {
-        log.info("Cores are: {}", cds.stream().map(CoreDescriptor::getName).collect(Collectors.toList()));
+      if (log.isDebugEnabled()) {
+        log.debug("Cores are: {}", cds.stream().map(CoreDescriptor::getName).collect(Collectors.toList()));
       }
     }
     return cds;
diff --git a/solr/core/src/java/org/apache/solr/core/SolrCore.java b/solr/core/src/java/org/apache/solr/core/SolrCore.java
index 29bc5bc..6d44700 100644
--- a/solr/core/src/java/org/apache/solr/core/SolrCore.java
+++ b/solr/core/src/java/org/apache/solr/core/SolrCore.java
@@ -173,6 +173,7 @@ import java.util.concurrent.Callable;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
@@ -900,8 +901,7 @@ public final class SolrCore implements SolrInfoBean, Closeable {
       log.debug("{}Solr index directory '{}' doesn't exist. Creating new index...", logid, indexDir);
 
       try (SolrIndexWriter writer = SolrIndexWriter.buildIndexWriter(this, "SolrCore.initIndex", indexDir, getDirectoryFactory(),
-              true, getLatestSchema(), solrConfig.indexConfig, solrDelPolicy, codec)) {
-        writer.commit();
+              true, getLatestSchema(), solrConfig.indexConfig, solrDelPolicy, codec, true)) {
       } catch (Exception e) {
         ParWork.propagateInterrupt(e);
         throw new SolrException(ErrorCode.SERVER_ERROR, e);
@@ -1228,15 +1228,15 @@ public final class SolrCore implements SolrInfoBean, Closeable {
       searcherReadyLatch.countDown();
 
       // nocommit - wait before publish active
-//      if (!getSolrConfig().useColdSearcher) {
-//        try {
-//          initSearcherFuture[0].get();
-//        } catch (InterruptedException e) {
-//          log.error("", e);
-//        } catch (ExecutionException e) {
-//          log.error("", e);
-//        }
-//      }
+      if (!getSolrConfig().useColdSearcher) {
+        try {
+          initSearcherFuture[0].get();
+        } catch (InterruptedException e) {
+          log.error("", e);
+        } catch (ExecutionException e) {
+          log.error("", e);
+        }
+      }
     }
 
 
@@ -1295,7 +1295,7 @@ public final class SolrCore implements SolrInfoBean, Closeable {
     }
     Future[] waitSearcher = new Future[1];
     try {
-      getSearcher(false, false, null, true);
+      getSearcher(false, false, waitSearcher, true);
     } finally {
       newReaderCreator = null;
       if (iwRef != null) {
diff --git a/solr/core/src/java/org/apache/solr/core/SolrCores.java b/solr/core/src/java/org/apache/solr/core/SolrCores.java
index 839d464..1c12281 100644
--- a/solr/core/src/java/org/apache/solr/core/SolrCores.java
+++ b/solr/core/src/java/org/apache/solr/core/SolrCores.java
@@ -115,7 +115,7 @@ class SolrCores implements Closeable {
     }
 
     cores.forEach((s, solrCore) -> {
-      container.solrCoreCloseExecutor.submit(() -> {
+      container.solrCoreExecutor.submit(() -> {
         MDCLoggingContext.setCore(solrCore);
         try {
           solrCore.closeAndWait();
diff --git a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
index 07887b3..ed01ffd 100644
--- a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
+++ b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
@@ -18,6 +18,7 @@ package org.apache.solr.handler;
 
 import com.google.common.base.Strings;
 import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.IndexCommit;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.SegmentInfos;
@@ -520,7 +521,7 @@ public class IndexFetcher {
       }
 
       // Create the sync service
-      fsyncService = ParWork.getExecutorService(15);
+      fsyncService = ParWork.getExecutorService(4);
       // use a synchronized list because the list is read by other threads (to show details)
       filesDownloaded = Collections.synchronizedList(new ArrayList<Map<String, Object>>());
       // if the generation of master is older than that of the slave , it means they are not compatible to be copied
@@ -724,7 +725,7 @@ public class IndexFetcher {
     ZkController zkController = solrCore.getCoreContainer().getZkController();
     CloudDescriptor cd = solrCore.getCoreDescriptor().getCloudDescriptor();
     Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry(
-        cd.getCollectionName(), cd.getShardId(), 1500, false);
+        cd.getCollectionName(), cd.getShardId(), 3000, false);
     return leaderReplica;
   }
 
@@ -1055,7 +1056,7 @@ public class IndexFetcher {
       log.warn("WARNING: clearing disk space ahead of time to avoid running out of space, could cause problems with current SolrCore approxTotalSpaceReqd{}, usableSpace={}", atsr, usableSpace);
       deleteFilesInAdvance(indexDir, indexDirPath, totalSpaceRequired, usableSpace);
     }
-    log.info("Files to download {}", filesToDownload);
+    if (log.isDebugEnabled()) log.debug("Files to download {}", filesToDownload);
     try {
       // nocommit
       try (ParWork parWork = new ParWork(this, true)) {
@@ -1113,7 +1114,7 @@ public class IndexFetcher {
               if (stop) {
                 throw new AlreadyClosedException();
               }
-              log.info("Downloaded {}", tmpIndexDir, file.get(NAME));
+              if (log.isDebugEnabled()) log.debug("Downloaded {} {}", tmpIndexDir, file.get(NAME));
               filesDownloaded.add(Collections.unmodifiableMap(file));
             } else {
               if (log.isDebugEnabled()) {
@@ -1231,8 +1232,13 @@ public class IndexFetcher {
           try {
             indexFileChecksum = CodecUtil.retrieveChecksum(indexInput);
             compareResult.checkSummed = true;
+          } catch (CorruptIndexException e) {
+            log.warn("Could not retrieve checksum from file.", e);
+            compareResult.equal = false;
+            return compareResult;
           } catch (Exception e) {
             log.warn("Could not retrieve checksum from file.", e);
+            compareResult.equal = false;
           }
         }
 
@@ -1722,7 +1728,6 @@ public class IndexFetcher {
         //if cleanup succeeds . The file is downloaded fully
         fsyncServiceFuture = fsyncService.submit(() -> {
           try {
-            log.info("Close fetched file", file);
             file.close();
           } catch (Exception e) {
             fsyncException = e;
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminOperation.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminOperation.java
index 935ee4a..655a12b 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminOperation.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminOperation.java
@@ -22,6 +22,7 @@ import java.nio.file.Path;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Optional;
+import java.util.concurrent.TimeUnit;
 
 import org.apache.commons.lang3.StringUtils;
 import org.apache.solr.cloud.CloudDescriptor;
@@ -99,8 +100,9 @@ enum CoreAdminOperation implements CoreAdminOp {
         log().warn("Will not create SolrCore, CoreContainer is shutdown");
         throw new AlreadyClosedException("Will not create SolrCore, CoreContainer is shutdown");
       }
-
+      long start = System.nanoTime();
       coreContainer.create(coreName, instancePath, coreParams, newCollection);
+      log().info("SolrCore {} created in {}ms", coreName, TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
 
       it.rsp.add("core", coreName);
     } finally {
@@ -283,6 +285,8 @@ enum CoreAdminOperation implements CoreAdminOp {
     }
   });
 
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
   final CoreAdminParams.CoreAdminAction action;
   final CoreAdminOp fun;
 
@@ -291,7 +295,7 @@ enum CoreAdminOperation implements CoreAdminOp {
     this.fun = fun;
   }
 
-  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
 
   static Logger log() {
     return log;
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
index ca07d0b..8434210 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java
@@ -69,7 +69,6 @@ class PrepRecoveryOp implements CoreAdminHandler.CoreAdminOp {
     try {
       coreContainer.getZkController().getZkStateReader().waitForState(collection, 5, TimeUnit.SECONDS, (n, c) -> {
         if (c == null) {
-          log.info("collection not found {}", collection);
           return false;
         }
 
@@ -80,8 +79,7 @@ class PrepRecoveryOp implements CoreAdminHandler.CoreAdminOp {
         if (replica != null) {
           isLive = coreContainer.getZkController().getZkStateReader().isNodeLive(replica.getNodeName());
           if (replica.getState() == waitForState) {
-            // if (log.isDebugEnabled()) log.debug("replica={} state={} waitForState={}", replica, replica.getState(), waitForState);
-            log.info("replica={} state={} waitForState={} isLive={}", replica, replica.getState(), waitForState, coreContainer.getZkController().getZkStateReader().isNodeLive(replica.getNodeName()));
+            if (log.isDebugEnabled()) log.debug("replica={} state={} waitForState={} isLive={}", replica, replica.getState(), waitForState, coreContainer.getZkController().getZkStateReader().isNodeLive(replica.getNodeName()));
             return true;
           }
         }
diff --git a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java
index 2c2dd9c..11b3e0e 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java
@@ -356,7 +356,7 @@ public class RealTimeGetComponent extends SearchComponent
     if (idStr == null) return;
     AtomicLong version = new AtomicLong();
     SolrInputDocument doc = getInputDocument(req.getCore(), new BytesRef(idStr), version, null, Resolution.DOC);
-    log.info("getInputDocument called for id={}, returning {}", idStr, doc);
+    if (log.isDebugEnabled()) log.debug("getInputDocument called for id={}, returning {}", idStr, doc);
     rb.rsp.add("inputDocument", doc);
     rb.rsp.add("version", version.get());
   }
@@ -970,7 +970,7 @@ public class RealTimeGetComponent extends SearchComponent
     // the mappings.
 
     for (int i=0; i<rb.slices.length; i++) {
-      log.info("LOOKUP_SLICE:{}={}", rb.slices[i], rb.shards[i]);
+      if (log.isDebugEnabled()) log.debug("LOOKUP_SLICE:{}={}", rb.slices[i], rb.shards[i]);
       if (lookup.equals(rb.slices[i]) || slice.equals(rb.slices[i])) {
         return new String[]{rb.shards[i]};
       }
@@ -1189,7 +1189,7 @@ public class RealTimeGetComponent extends SearchComponent
 
     // TODO: get this from cache instead of rebuilding?
     try (UpdateLog.RecentUpdates recentUpdates = ulog.getRecentUpdates()) {
-      log.info("Get updates  versionsRequested={} params={}", versions.size(), params);
+      if (log.isDebugEnabled()) log.debug("Get updates  versionsRequested={} params={}", versions.size(), params);
       for (Long version : versions) {
         try {
           Object o = recentUpdates.lookup(version);
diff --git a/solr/core/src/java/org/apache/solr/schema/ZkIndexSchemaReader.java b/solr/core/src/java/org/apache/solr/schema/ZkIndexSchemaReader.java
index c942a4a..2fed048 100644
--- a/solr/core/src/java/org/apache/solr/schema/ZkIndexSchemaReader.java
+++ b/solr/core/src/java/org/apache/solr/schema/ZkIndexSchemaReader.java
@@ -132,6 +132,8 @@ public class ZkIndexSchemaReader implements OnReconnect {
     public void close() throws IOException {
       try {
         schemaReader.zkClient.getSolrZooKeeper().removeWatches(schemaReader.managedSchemaPath, this, WatcherType.Any, true);
+      } catch (KeeperException.NoWatcherException e) {
+        // watch was already removed (or never registered) - nothing to clean up
       } catch (Exception e) {
         if (log.isDebugEnabled()) log.debug("could not remove watch {} {}", e.getClass().getSimpleName(), e.getMessage());
       }
diff --git a/solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java b/solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java
index f6b6f13..30efcc2 100644
--- a/solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java
+++ b/solr/core/src/java/org/apache/solr/servlet/SolrQoSFilter.java
@@ -50,7 +50,7 @@ public class SolrQoSFilter extends QoSFilter {
   @Override
   public void init(FilterConfig filterConfig) {
     super.init(filterConfig);
-    _origMaxRequests = Integer.getInteger("solr.concurrentRequests.max", 10000);
+    _origMaxRequests = Integer.getInteger("solr.concurrentRequests.max", 1000);
     super.setMaxRequests(_origMaxRequests);
     super.setSuspendMs(Integer.getInteger("solr.concurrentRequests.suspendms", 30000));
     super.setWaitMs(Integer.getInteger("solr.concurrentRequests.waitms", 2000));
@@ -65,49 +65,11 @@ public class SolrQoSFilter extends QoSFilter {
     boolean imagePath = req.getPathInfo() != null && req.getPathInfo().startsWith("/img/");
     boolean externalRequest = !imagePath && (source == null || !source.equals(QoSParams.INTERNAL));
     if (log.isDebugEnabled()) log.debug("SolrQoSFilter {} {} {}", sysStats.getSystemLoad(), sysStats.getTotalUsage(), externalRequest);
+    // log.info("SolrQoSFilter {} {} {}", sysStats.getSystemLoad(), sysStats.getTotalUsage(), externalRequest); // duplicate of the guarded debug line above
 
     if (externalRequest) {
       if (log.isDebugEnabled()) log.debug("external request"); //nocommit: remove when testing is done
-      double ourLoad = sysStats.getTotalUsage();
-      if (log.isDebugEnabled()) log.debug("Our individual load is {}", ourLoad);
-      double sLoad = sysStats.getSystemLoad();
-      if (ourLoad > SysStats.OUR_LOAD_HIGH) {
-
-        int cMax = getMaxRequests();
-        if (cMax > 5) {
-          int max = Math.max(5, (int) ((double) cMax * 0.30D));
-          log.warn("Our individual load is {}", ourLoad);
-          updateMaxRequests(max, sLoad, ourLoad);
-        }
-
-      } else {
-        // nocommit - deal with no supported, use this as a fail safe with high and low watermark?
-        if (ourLoad < 0.90 && sLoad < 1.6 && _origMaxRequests != getMaxRequests()) {
-          if (sLoad < 0.9) {
-            if (log.isDebugEnabled()) log.debug("set max concurrent requests to orig value {}", _origMaxRequests);
-            updateMaxRequests(_origMaxRequests, sLoad, ourLoad);
-          } else {
-            updateMaxRequests(Math.min(_origMaxRequests, getMaxRequests() * 3), sLoad, ourLoad);
-          }
-        } else {
-          if (ourLoad > 0.90 && sLoad > 1.5) {
-            int cMax = getMaxRequests();
-            if (cMax > 5) {
-              int max = Math.max(5, (int) ((double) cMax * 0.30D));
-            //  log.warn("System load is {} and our load is {} procs is {}, set max concurrent requests to {}", sLoad, ourLoad, SysStats.PROC_COUNT, max);
-              updateMaxRequests(max, sLoad, ourLoad);
-            }
-          } else if (ourLoad < 0.90 && sLoad < 2 && _origMaxRequests != getMaxRequests()) {
-            if (sLoad < 0.9) {
-              if (log.isDebugEnabled()) log.debug("set max concurrent requests to orig value {}", _origMaxRequests);
-              updateMaxRequests(_origMaxRequests, sLoad, ourLoad);
-            } else {
-              updateMaxRequests(Math.min(_origMaxRequests, getMaxRequests() * 3), sLoad, ourLoad);
-            }
-
-          }
-        }
-      }
+      checkLoad();
 
       //chain.doFilter(req, response);
       super.doFilter(req, response, chain);
@@ -118,11 +80,51 @@ public class SolrQoSFilter extends QoSFilter {
     }
   }
 
+  private void checkLoad() {
+    double ourLoad = sysStats.getTotalUsage();
+    int currentMaxRequests = getMaxRequests();
+    if (log.isDebugEnabled()) log.debug("Our individual load is {}", ourLoad);
+    double sLoad = sysStats.getSystemLoad();
+
+
+    if (lowStateLoad(sLoad, currentMaxRequests)) {
+      // load has dropped back below the low watermark: restore the configured
+      // original limit (Math.min(_origMaxRequests, _origMaxRequests) was a no-op)
+      if (log.isDebugEnabled()) log.debug("set max concurrent requests to orig value {}", _origMaxRequests);
+      updateMaxRequests(_origMaxRequests, sLoad, ourLoad);
+    } else {
+
+      if (hiLoadState(sLoad, currentMaxRequests)) {
+        int cMax = getMaxRequests();
+        int max = Math.max(5, (int) ((double) cMax * 0.90D));
+        updateMaxRequests(max, sLoad, ourLoad);
+      }
+    }
+      // nocommit - deal with no supported, use this as a fail safe with high and low watermark?
+  }
+
+  private boolean lowStateLoad(double sLoad, int currentMaxRequests) {
+    return currentMaxRequests < _origMaxRequests && sLoad < 1.0d;
+  }
+
+  private boolean hiLoadState(double sLoad, int currentMaxRequests) {
+    return sLoad > 9.0d;
+  }
+
   private void updateMaxRequests(int max, double sLoad, double ourLoad) {
-    if (System.currentTimeMillis() - lastUpdate > 2000) {
+    int currentMax = getMaxRequests();
+    if (max < currentMax) {
+      if (System.currentTimeMillis() - lastUpdate > 500) {
+        log.warn("Set max request to {} sload={} ourload={}", max, sLoad, ourLoad);
+        lastUpdate = System.currentTimeMillis();
+        setMaxRequests(max);
+      }
+    } else if (max > currentMax) {
+
       log.warn("Set max request to {} sload={} ourload={}", max, sLoad, ourLoad);
       lastUpdate = System.currentTimeMillis();
       setMaxRequests(max);
     }
+
   }
 }
\ No newline at end of file
diff --git a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
index 02ecd79..c8a271d 100644
--- a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
+++ b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
@@ -272,7 +272,7 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover
     SolrIndexWriter iw;
     try {
       iw = SolrIndexWriter.buildIndexWriter(core, name, core.getNewIndexDir(), core.getDirectoryFactory(), false, core.getLatestSchema(),
-              core.getSolrConfig().indexConfig, core.getDeletionPolicy(), core.getCodec());
+              core.getSolrConfig().indexConfig, core.getDeletionPolicy(), core.getCodec(), false);
     } catch (Exception e) {
       ParWork.propagateInterrupt(e);
       throw new SolrException(ErrorCode.SERVER_ERROR, e);
@@ -432,7 +432,7 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover
 
   @Override
   public void cancelRecovery(boolean wait, boolean prepForClose) {
-    log.info("Cancel recovery");
+    if (log.isDebugEnabled()) log.debug("Cancel recovery");
     recoverying = false;
     
     if (prepForClose) {
diff --git a/solr/core/src/java/org/apache/solr/update/PeerSync.java b/solr/core/src/java/org/apache/solr/update/PeerSync.java
index d24f905..a39c8db 100644
--- a/solr/core/src/java/org/apache/solr/update/PeerSync.java
+++ b/solr/core/src/java/org/apache/solr/update/PeerSync.java
@@ -140,6 +140,7 @@ public class PeerSync implements SolrMetricProducer {
   }
 
   public static long percentile(List<Long> arr, float frac) {
+    if (arr.size() == 0) return 0;
     int elem = (int) (arr.size() * frac);
     return Math.abs(arr.get(elem));
   }
@@ -206,25 +207,8 @@ public class PeerSync implements SolrMetricProducer {
         // we have no versions and hence no frame of reference to tell if we can use a peers
         // updates to bring us into sync
 
-        log.info("{} DONE. We have no versions. sync failed.", msg());
-
-        for (;;)  {
-          if (log.isDebugEnabled()) log.debug("looping in check for versions on others");
-          ShardResponse srsp = shardHandler.takeCompletedIncludingErrors();
-          if (srsp == null) break;
-          if (srsp.getException() == null)  {
-            if (log.isDebugEnabled()) log.debug("checking if others have versions {} {}", srsp.getSolrResponse().getResponse());
-            List<Long> otherVersions = (List<Long>)srsp.getSolrResponse().getResponse().get("versions");
-            if (otherVersions != null && !otherVersions.isEmpty())  {
-              if (syncErrors != null) syncErrors.inc();
-              if (log.isDebugEnabled()) log.debug("found another replica with versions");
-              return PeerSyncResult.failure(true);
-            }
-          }
-        }
-        if (syncErrors != null) syncErrors.inc();
-        if (log.isDebugEnabled()) log.debug("found no other replica with versions");
-        return PeerSyncResult.failure(false);
+        return failOnNoVersions();
+
       }
 
       MissedUpdatesFinder missedUpdatesFinder = new MissedUpdatesFinder(ourUpdates, msg(), nUpdates, ourLowThreshold, ourHighThreshold);
@@ -265,6 +249,27 @@ public class PeerSync implements SolrMetricProducer {
     }
   }
 
+  private PeerSyncResult failOnNoVersions() {
+    log.info("{} DONE. We have no versions. sync failed.", msg());
+
+    for (;;)  {
+      ShardResponse srsp = shardHandler.takeCompletedIncludingErrors();
+      if (srsp == null) break;
+      if (srsp.getException() == null)  {
+        if (log.isDebugEnabled()) log.debug("checking if others have versions {}", srsp.getSolrResponse().getResponse());
+        List<Long> otherVersions = (List<Long>)srsp.getSolrResponse().getResponse().get("versions");
+        if (otherVersions != null && !otherVersions.isEmpty())  {
+          if (syncErrors != null) syncErrors.inc();
+          if (log.isDebugEnabled()) log.debug("found another replica with versions");
+          return PeerSyncResult.failure(true);
+        }
+      }
+    }
+    if (syncErrors != null) syncErrors.inc();
+    if (log.isDebugEnabled()) log.debug("found no other replica with versions");
+    return PeerSyncResult.failure(false);
+  }
+
   /**
    * Check if we are already in sync. Simple fingerprint comparison should do
    */
@@ -406,31 +411,6 @@ public class PeerSync implements SolrMetricProducer {
     }
   }
 
-  private boolean canHandleVersionRanges(String replica) {
-    SyncShardRequest sreq = new SyncShardRequest();
-    requests.add(sreq);
-
-    // determine if leader can handle version ranges
-    sreq.shards = new String[] {replica};
-    sreq.actualShards = sreq.shards;
-    sreq.params = new ModifiableSolrParams();
-    sreq.params.set("qt", "/get");
-    sreq.params.set(DISTRIB, false);
-    sreq.params.set("checkCanHandleVersionRanges", false);
-
-    ShardHandler sh = shardHandlerFactory.getShardHandler();
-    sh.submit(sreq, replica, sreq.params);
-
-    ShardResponse srsp = sh.takeCompletedIncludingErrors();
-    Boolean canHandleVersionRanges = srsp.getSolrResponse().getResponse().getBooleanArg("canHandleVersionRanges");
-
-    if (canHandleVersionRanges == null || canHandleVersionRanges.booleanValue() == false) {
-      return false;
-    }
-
-    return true;
-  }
-
   private boolean handleVersions(ShardResponse srsp, MissedUpdatesFinder missedUpdatesFinder) {
     // we retrieved the last N updates from the replica
     @SuppressWarnings({"unchecked"})
@@ -440,8 +420,8 @@ public class PeerSync implements SolrMetricProducer {
     SyncShardRequest sreq = (SyncShardRequest) srsp.getShardRequest();
     Object fingerprint = srsp.getSolrResponse().getResponse().get("fingerprint");
 
-    if (log.isInfoEnabled()) {
-      log.info("{} Received {} versions from {} {} fingerprint:{}", msg(), otherVersions.size(), otherVersions, sreq.shards[0], fingerprint);
+    if (log.isDebugEnabled()) {
+      log.debug("{} Received {} versions from {} {} fingerprint:{}", msg(), otherVersions.size(), otherVersions, sreq.shards[0], fingerprint);
     }
     if (fingerprint != null) {
       sreq.fingerprint = IndexFingerprint.fromObject(fingerprint);
@@ -454,7 +434,7 @@ public class PeerSync implements SolrMetricProducer {
     
     MissedUpdatesRequest updatesRequest = missedUpdatesFinder.find(
         otherVersions, sreq.shards[0],
-        () -> core.getSolrConfig().useRangeVersionsForPeerSync && canHandleVersionRanges(sreq.shards[0]));
+        () -> core.getSolrConfig().useRangeVersionsForPeerSync);
 
     if (updatesRequest == MissedUpdatesRequest.ALREADY_IN_SYNC) {
       return true;
diff --git a/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java b/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java
index b670372..2a480a6 100644
--- a/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java
+++ b/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java
@@ -171,7 +171,10 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
 
       // now make sure that the starting updates overlap our updates
       // there shouldn't be reorders, so any overlap will do.
-      long smallestNewUpdate = Math.abs(ourUpdates.get(ourUpdates.size() - 1));
+      long smallestNewUpdate = 0;
+      if (ourUpdates.size() > 0) {
+        smallestNewUpdate = Math.abs(ourUpdates.get(ourUpdates.size() - 1));
+      }
 
       if (Math.abs(startingVersions.get(0)) < smallestNewUpdate) {
         log.warn("{} too many updates received since start - startingUpdates no longer overlaps with our currentUpdates", msg());
@@ -298,7 +301,7 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
         // only DBI or DBQ in the gap (above) will satisfy this predicate
         return version > leaderFingerprint.getMaxVersionEncountered() && (oper == UpdateLog.DELETE || oper == UpdateLog.DELETE_BY_QUERY);
       });
-      log.info("existDBIOrDBQInTheGap={}", existDBIOrDBQInTheGap);
+      if (log.isDebugEnabled()) log.debug("existDBIOrDBQInTheGap={}", existDBIOrDBQInTheGap);
       if (!existDBIOrDBQInTheGap) {
         // it is safe to use leaderFingerprint.maxVersionEncountered as cut point now.
         updates.removeIf(e -> {
@@ -306,7 +309,7 @@ public class PeerSyncWithLeader implements SolrMetricProducer {
           List<Object> u = (List<Object>) e;
           long version = (Long) u.get(1);
           boolean success = version > leaderFingerprint.getMaxVersionEncountered();
-          log.info("existDBIOrDBQInTheGap version={}  leaderFingerprint.getMaxVersionEncountered={} success={}", version, leaderFingerprint.getMaxVersionEncountered(), success);
+          if (log.isDebugEnabled()) log.debug("existDBIOrDBQInTheGap version={}  leaderFingerprint.getMaxVersionEncountered={} success={}", version, leaderFingerprint.getMaxVersionEncountered(), success);
           return success;
         });
       }
diff --git a/solr/core/src/java/org/apache/solr/update/SolrCoreState.java b/solr/core/src/java/org/apache/solr/update/SolrCoreState.java
index c53fe21..b0885ef 100644
--- a/solr/core/src/java/org/apache/solr/update/SolrCoreState.java
+++ b/solr/core/src/java/org/apache/solr/update/SolrCoreState.java
@@ -70,7 +70,7 @@ public abstract class SolrCoreState {
     boolean close = false;
     synchronized (this) {
       solrCoreStateRefCnt--;
-      log.info("SolrCoreState ref count {}", solrCoreStateRefCnt);
+      if (log.isDebugEnabled()) log.debug("SolrCoreState ref count {}", solrCoreStateRefCnt);
 
       if (solrCoreStateRefCnt == 0) {
         closed = true;
@@ -80,7 +80,7 @@ public abstract class SolrCoreState {
     
     if (close) {
       try {
-        log.debug("Closing SolrCoreState");
+        if (log.isDebugEnabled()) log.debug("Closing SolrCoreState");
         close(closer);
       } catch (Exception e) {
         log.error("Error closing SolrCoreState", e);
diff --git a/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java b/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java
index 3e55dab..d66f777 100644
--- a/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java
+++ b/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java
@@ -291,7 +291,7 @@ public class SolrIndexSplitter {
           t = timings.sub("createSubIW");
           t.resume();
           iw = SolrIndexWriter.buildIndexWriter(core, partitionName, path, core.getDirectoryFactory(), true, core.getLatestSchema(),
-                  core.getSolrConfig().indexConfig, core.getDeletionPolicy(), core.getCodec());
+                  core.getSolrConfig().indexConfig, core.getDeletionPolicy(), core.getCodec(), true);
           t.pause();
         }
       }
diff --git a/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java b/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java
index 2674487..d710db8 100644
--- a/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java
+++ b/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java
@@ -112,12 +112,12 @@ public class SolrIndexWriter extends IndexWriter {
 //    return w;
 //  }
 
-  public static SolrIndexWriter buildIndexWriter(SolrCore core, String name, String path, DirectoryFactory directoryFactory, boolean create, IndexSchema schema, SolrIndexConfig config, IndexDeletionPolicy delPolicy, Codec codec) {
+  public static SolrIndexWriter buildIndexWriter(SolrCore core, String name, String path, DirectoryFactory directoryFactory, boolean create, IndexSchema schema, SolrIndexConfig config, IndexDeletionPolicy delPolicy, Codec codec, boolean commitOnClose) {
     SolrIndexWriter iw = null;
     Directory dir = null;
     try {
       dir = getDir(directoryFactory, path, config);
-      iw = new SolrIndexWriter(core, name, directoryFactory, dir, create, schema, config, delPolicy, codec);
+      iw = new SolrIndexWriter(core, name, directoryFactory, dir, create, schema, config, delPolicy, codec, commitOnClose);
     } catch (Throwable e) {
       ParWork.propagateInterrupt(e);
       SolrException exp = new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
@@ -170,11 +170,11 @@ public class SolrIndexWriter extends IndexWriter {
     assert ObjectReleaseTracker.track(this);
   }
 
-  public SolrIndexWriter(SolrCore core, String name, DirectoryFactory directoryFactory, Directory directory, boolean create, IndexSchema schema, SolrIndexConfig config, IndexDeletionPolicy delPolicy, Codec codec) throws IOException {
+  public SolrIndexWriter(SolrCore core, String name, DirectoryFactory directoryFactory, Directory directory, boolean create, IndexSchema schema, SolrIndexConfig config, IndexDeletionPolicy delPolicy, Codec codec, boolean commitOnClose) throws IOException {
     super(directory,
             config.toIndexWriterConfig(core).
                     setOpenMode(create ? IndexWriterConfig.OpenMode.CREATE : IndexWriterConfig.OpenMode.APPEND).
-                    setIndexDeletionPolicy(delPolicy).setCodec(codec)
+                    setIndexDeletionPolicy(delPolicy).setCodec(codec).setCommitOnClose(commitOnClose)
     );
     try {
     if (log.isDebugEnabled()) log.debug("Opened Writer " + name);
diff --git a/solr/core/src/java/org/apache/solr/update/UpdateLog.java b/solr/core/src/java/org/apache/solr/update/UpdateLog.java
index 39fa5de..2718286 100644
--- a/solr/core/src/java/org/apache/solr/update/UpdateLog.java
+++ b/solr/core/src/java/org/apache/solr/update/UpdateLog.java
@@ -1596,25 +1596,27 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
           if(Math.abs(ptr.version) > Math.abs(maxVersion)) continue;
           if (ptr.version != 0) {
             ret.add(ptr.version);
+          } else {
+            log.warn("Found version of 0 {} {} {}", ptr.pointer, ptr.previousVersion, ptr.log);
           }
           if (--n <= 0) return ret;
         }
       }
-      log.info("Return getVersions {} {}", n, ret);
+      if (log.isDebugEnabled()) log.debug("Return getVersions {} {}", n, ret);
 
       return ret;
     }
 
     public Object lookup(long version) {
-      log.info("lookup {}", version);
+      if (log.isDebugEnabled()) log.debug("lookup {}", version);
       Update update = updates.get(version);
       if (update == null) return null;
 
-      log.info("found update from updates {} {}", update.version, updates.size());
+      if (log.isDebugEnabled()) log.debug("found update from updates {} {}", update.version, updates.size());
 
       Object object = update.log.lookup(update.pointer);
 
-      log.info("found update from log {} {} ptr={} object={}", update.version, update.log, update.pointer, object);
+      if (log.isDebugEnabled()) log.debug("found update from log {} {} ptr={} object={}", update.version, update.log, update.pointer, object);
 
       return object;
     }
@@ -1704,7 +1706,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
             numUpdates++;
           }
 
-          log.info("Recent updates updates numUpdates={} numUpdatesToKeep={}", numUpdates, numRecordsToKeep);
+          if (log.isDebugEnabled()) log.debug("Recent updates updates numUpdates={} numUpdatesToKeep={}", numUpdates, numRecordsToKeep);
 
         } catch (IOException | AssertionError e) { // catch AssertionError to handle certain test failures correctly
           // failure to read a log record isn't fatal
diff --git a/solr/core/src/java/org/apache/solr/update/UpdateShardHandler.java b/solr/core/src/java/org/apache/solr/update/UpdateShardHandler.java
index ecdf3b6..0f7800e 100644
--- a/solr/core/src/java/org/apache/solr/update/UpdateShardHandler.java
+++ b/solr/core/src/java/org/apache/solr/update/UpdateShardHandler.java
@@ -108,7 +108,7 @@ public class UpdateShardHandler implements SolrInfoBean {
           .connectionTimeout(cfg.getDistributedConnectionTimeout())
           .idleTimeout(cfg.getDistributedSocketTimeout());
     }
-    updateOnlyClient = updateOnlyClientBuilder.markInternalRequest().strictEventOrdering(true).build();
+    updateOnlyClient = updateOnlyClientBuilder.markInternalRequest().strictEventOrdering(false).build();
     updateOnlyClient.enableCloseLock();
    // updateOnlyClient.addListenerFactory(updateHttpListenerFactory);
     Set<String> queryParams = new HashSet<>(2);
diff --git a/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java
index a5a8847..3493dac 100644
--- a/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java
+++ b/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java
@@ -190,7 +190,7 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
             EnumSet.of(Replica.Type.TLOG, Replica.Type.NRT), true);
 
         try {
-          leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, cloudDesc.getShardId(), 1000, false);
+          leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, cloudDesc.getShardId(), 3000, false);
         } catch (Exception e) {
           ParWork.propagateInterrupt(e);
           throw new SolrException(ErrorCode.SERVER_ERROR,
@@ -645,7 +645,7 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
           + "failed since we're not in cloud mode.");
     }
     try {
-      return zkController.getZkStateReader().getLeaderRetry(collection, cloudDesc.getShardId(), 1500, false).getCoreUrl();
+      return zkController.getZkStateReader().getLeaderRetry(collection, cloudDesc.getShardId(), 3000, false).getCoreUrl();
     } catch (InterruptedException | TimeoutException e) {
       ParWork.propagateInterrupt(e);
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Exception during fetching from leader.", e);
@@ -717,14 +717,14 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
     try {
       // Not equivalent to getLeaderProps, which  retries to find a leader.
       // Replica leader = slice.getLeader();
-      Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, shardId, 100, false);
+      Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, shardId, 3000, false);
       isLeader = leaderReplica.getName().equals(desc.getName());
       if (log.isTraceEnabled()) log.trace("Are we leader for sending to replicas? {} phase={}", isLeader, phase);
       if (!isLeader) {
         isSubShardLeader = amISubShardLeader(coll, slice, id, doc);
         if (isSubShardLeader) {
           shardId = cloudDesc.getShardId();
-          leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, shardId, 1500, false);
+          leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, shardId, 3000, false);
         }
       }
 
diff --git a/solr/core/src/test/org/apache/solr/cloud/MissingSegmentRecoveryTest.java b/solr/core/src/test/org/apache/solr/cloud/MissingSegmentRecoveryTest.java
index c0907f2..addc405 100644
--- a/solr/core/src/test/org/apache/solr/cloud/MissingSegmentRecoveryTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/MissingSegmentRecoveryTest.java
@@ -38,11 +38,13 @@ import org.junit.After;
 import org.junit.AfterClass;
 import org.junit.Before;
 import org.junit.BeforeClass;
+import org.junit.Ignore;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 @Slow
+@Ignore // nocommit - this feature needs a little work
 public class MissingSegmentRecoveryTest extends SolrCloudTestCase {
 
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
diff --git a/solr/server/etc/jetty-https.xml b/solr/server/etc/jetty-https.xml
index 331cb3d..c12b4f6 100644
--- a/solr/server/etc/jetty-https.xml
+++ b/solr/server/etc/jetty-https.xml
@@ -79,7 +79,7 @@
         <Set name="acceptQueueSize"><Property name="solr.jetty.https.acceptQueueSize" default="4096"/></Set>
         <Call name="addLifeCycleListener">
           <Arg>
-            <New class="org.apache.solr.servlet.SolrConnectorListener"/>
+            <New class="org.apache.solr.servlet.SolrLifcycleListener"/>
           </Arg>
         </Call>
       </New>
diff --git a/solr/server/etc/jetty-https8.xml b/solr/server/etc/jetty-https8.xml
index f937852..9116a36 100644
--- a/solr/server/etc/jetty-https8.xml
+++ b/solr/server/etc/jetty-https8.xml
@@ -62,6 +62,11 @@
         <Set name="idleTimeout"><Property name="solr.jetty.https.timeout" default="120000"/></Set>
         <Set name="acceptorPriorityDelta"><Property name="solr.jetty.ssl.acceptorPriorityDelta" default="0"/></Set>
         <Set name="acceptQueueSize"><Property name="solr.jetty.https.acceptQueueSize" default="0"/></Set>
+        <Call name="addLifeCycleListener">
+          <Arg>
+            <New class="org.apache.solr.servlet.SolrLifcycleListener"/>
+          </Arg>
+        </Call>
       </New>
     </Arg>
   </Call>
diff --git a/solr/server/etc/jetty.xml b/solr/server/etc/jetty.xml
index 438eb36..0c43ba9 100644
--- a/solr/server/etc/jetty.xml
+++ b/solr/server/etc/jetty.xml
@@ -152,10 +152,6 @@
            <Set name="handlers">
              <Array type="org.eclipse.jetty.server.Handler">
                <Item>
-                 <New id="ShutdownHandler" class="org.apache.solr.servlet.SolrShutdownHandler">
-                 </New>
-               </Item>
-               <Item>
                  <New class="org.eclipse.jetty.server.handler.InetAccessHandler">
                    <Call name="include">
                      <Arg>
diff --git a/solr/server/resources/log4j2.xml b/solr/server/resources/log4j2.xml
index 6ac550a..ae030fd 100644
--- a/solr/server/resources/log4j2.xml
+++ b/solr/server/resources/log4j2.xml
@@ -39,9 +39,9 @@
       </PatternLayout>
       <Policies>
         <OnStartupTriggeringPolicy />
-        <SizeBasedTriggeringPolicy size="32 MB"/>
+        <SizeBasedTriggeringPolicy size="64 MB"/>
       </Policies>
-      <DefaultRolloverStrategy max="10"/>
+      <DefaultRolloverStrategy max="20"/>
     </RollingRandomAccessFile>
 
     <RollingRandomAccessFile
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/Http2SolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/Http2SolrClient.java
index 945fb62..e0de06e 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/Http2SolrClient.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/Http2SolrClient.java
@@ -83,6 +83,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.lang.invoke.MethodHandles;
+import java.lang.management.ManagementFactory;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URL;
@@ -116,6 +117,9 @@ import java.util.concurrent.TimeoutException;
  * @lucene.experimental
  */
 public class Http2SolrClient extends SolrClient {
+
+  public static final int PROC_COUNT = ManagementFactory.getOperatingSystemMXBean().getAvailableProcessors();
+
   public static final String REQ_PRINCIPAL_KEY = "solr-req-principal";
 
   private static volatile SSLConfig defaultSSLConfig;
@@ -218,7 +222,7 @@ public class Http2SolrClient extends SolrClient {
       ssl = true;
     }
     // nocommit - look at config again as well
-    int minThreads = Integer.getInteger("solr.minHttp2ClientThreads", 6);
+    int minThreads = Integer.getInteger("solr.minHttp2ClientThreads", PROC_COUNT);
     httpClientExecutor = new SolrQueuedThreadPool("http2Client", builder.maxThreadPoolSize, minThreads,
         this.headers != null && this.headers.containsKey(QoSParams.REQUEST_SOURCE) && this.headers.get(QoSParams.REQUEST_SOURCE).equals(QoSParams.INTERNAL) ? 3000 : 5000,
         null, -1, null);
@@ -1101,7 +1105,7 @@ public class Http2SolrClient extends SolrClient {
 
   public static class Builder {
 
-    public int maxThreadPoolSize = Integer.getInteger("solr.maxHttp2ClientThreads", 512);
+    public int maxThreadPoolSize = Integer.getInteger("solr.maxHttp2ClientThreads", PROC_COUNT * 2);
     public int maxRequestsQueuedPerDestination = 1600;
     private Http2SolrClient http2SolrClient;
     private SSLConfig sslConfig = defaultSSLConfig;
diff --git a/solr/solrj/src/java/org/apache/solr/common/ParWork.java b/solr/solrj/src/java/org/apache/solr/common/ParWork.java
index 0627e65..42b3827 100644
--- a/solr/solrj/src/java/org/apache/solr/common/ParWork.java
+++ b/solr/solrj/src/java/org/apache/solr/common/ParWork.java
@@ -73,15 +73,15 @@ public class ParWork implements Closeable {
 
   private static volatile ParWorkExecutor EXEC;
 
-  // pretty much don't use it
   public static ParWorkExecutor getRootSharedExecutor() {
     if (EXEC == null) {
       synchronized (ParWork.class) {
         if (EXEC == null) {
           EXEC = (ParWorkExecutor) getParExecutorService("RootExec",
-              Integer.getInteger("solr.rootSharedThreadPoolCoreSize", 15), Integer.MAX_VALUE, 1000,
+              Integer.getInteger("solr.rootSharedThreadPoolCoreSize", 200), Integer.MAX_VALUE, 1000,
               new SynchronousQueue());
           ((ParWorkExecutor)EXEC).enableCloseLock();
+          EXEC.prestartAllCoreThreads();
         }
       }
     }
@@ -89,10 +89,12 @@ public class ParWork implements Closeable {
   }
 
   public static void shutdownParWorkExecutor() {
-    try {
-      shutdownParWorkExecutor(EXEC, true);
-    } finally {
-      EXEC = null;
+    synchronized (ParWork.class) {
+      try {
+        shutdownParWorkExecutor(EXEC, true);
+      } finally {
+        EXEC = null;
+      }
     }
   }
 
@@ -496,7 +498,7 @@ public class ParWork implements Closeable {
         Integer minThreads;
         Integer maxThreads;
         minThreads = 4;
-        maxThreads = PROC_COUNT;
+        maxThreads = PROC_COUNT / 2;
         exec = getExecutorService(Math.max(minThreads, maxThreads)); // keep alive directly affects how long a worker might
        // ((PerThreadExecService)exec).closeLock(true);
         // be stuck in poll without an enqueue on shutdown
diff --git a/solr/solrj/src/java/org/apache/solr/common/PerThreadExecService.java b/solr/solrj/src/java/org/apache/solr/common/PerThreadExecService.java
index f8ccd29..269da51 100644
--- a/solr/solrj/src/java/org/apache/solr/common/PerThreadExecService.java
+++ b/solr/solrj/src/java/org/apache/solr/common/PerThreadExecService.java
@@ -206,22 +206,20 @@ public class PerThreadExecService extends AbstractExecutorService {
     return maxSize;
   }
 
-  public boolean checkLoad() {
+  private boolean checkLoad() {
 
-    double ourLoad = ParWork.getSysStats().getTotalUsage();
-    if (ourLoad > SysStats.OUR_LOAD_HIGH) {
-      if (log.isDebugEnabled()) log.debug("Our cpu usage is too high, run in caller thread {}", ourLoad);
+    double sLoad = sysStats.getSystemLoad();
+
+    if (hiStateLoad(sLoad)) {
       return false;
-    } else {
-      double sLoad = sysStats.getSystemLoad();
-      if (sLoad > 1) {
-        if (log.isDebugEnabled()) log.debug("System load is too high, run in caller thread {}", sLoad);
-        return false;
-      }
     }
     return true;
   }
-  
+
+  private boolean hiStateLoad(double sLoad) {
+    return sLoad > 8.0d && running.get() > 3;
+  }
+
   public void closeLock(boolean lock) {
     if (lock) {
       closeTracker.enableCloseLock();
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZooKeeper.java b/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZooKeeper.java
index 9338679..f3fc018 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZooKeeper.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZooKeeper.java
@@ -104,26 +104,26 @@ public class SolrZooKeeper extends ZooKeeper {
   @Override
   public void close() {
     if (closeTracker != null) closeTracker.close();
-//    try {
-//      try {
-//        RequestHeader h = new RequestHeader();
-//        h.setType(ZooDefs.OpCode.closeSession);
-//
-//        cnxn.submitRequest(h, null, null, null);
-//      } catch (InterruptedException e) {
-//        // ignore, close the send/event threads
-//      } finally {
-//        ZooKeeperExposed zk = new ZooKeeperExposed(this, cnxn);
-//        zk.closeCnxn();
-//      }
-//    } catch (Exception e) {
-//      log.warn("Exception closing zookeeper client", e);
-//    }
     try {
-      super.close();
-    } catch (InterruptedException e) {
-      e.printStackTrace();
+      try {
+        RequestHeader h = new RequestHeader();
+        h.setType(ZooDefs.OpCode.closeSession);
+
+        cnxn.submitRequest(h, null, null, null);
+      } catch (InterruptedException e) {
+        // ignore, close the send/event threads
+      } finally {
+        ZooKeeperExposed zk = new ZooKeeperExposed(this, cnxn);
+        zk.closeCnxn();
+      }
+    } catch (Exception e) {
+      log.warn("Exception closing zookeeper client", e);
     }
+//    try {
+//      super.close();
+//    } catch (InterruptedException e) {
+//      e.printStackTrace();
+//    }
   }
 
 }
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
index f4e7dd6..ff463e3 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
@@ -968,33 +968,6 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
    * Get shard leader properties, with retry if none exist.
    */
   public Replica getLeaderRetry(String collection, String shard, int timeout, boolean mustBeLive) throws InterruptedException, TimeoutException {
-
-    DocCollection coll = getClusterState().getCollectionOrNull(collection);
-
-    if (coll != null) {
-      Slice slice = coll.getSlice(shard);
-      if (slice != null) {
-        Replica leader = slice.getLeader();
-        boolean valid = false;
-        try {
-          valid = mustBeLive ? leader != null && isNodeLive(leader.getNodeName()) || zkClient.exists(COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" + slice.getName() + "/leader") : leader != null && isNodeLive(leader.getNodeName());
-        } catch (KeeperException e) {
-
-        } catch (InterruptedException e) {
-
-        }
-        if (leader != null && leader.getState() == Replica.State.ACTIVE && valid) {
-          return leader;
-        }
-        Collection<Replica> replicas = slice.getReplicas();
-        for (Replica replica : replicas) {
-          if ("true".equals(replica.getProperty(LEADER_PROP)) && replica.getState() == Replica.State.ACTIVE && valid) {
-            return replica;
-          }
-        }
-      }
-    }
-
     AtomicReference<Replica> returnLeader = new AtomicReference<>();
     try {
       waitForState(collection, timeout, TimeUnit.MILLISECONDS, (n, c) -> {
@@ -1002,26 +975,40 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
           return false;
         Slice slice = c.getSlice(shard);
         if (slice == null) return false;
+        Replica zkLeader = null;
         Replica leader = slice.getLeader();
-        boolean valid = false;
-        try {
-          valid = mustBeLive ?  (leader != null && isNodeLive(leader.getNodeName())) ||
-              zkClient.exists(COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" + slice.getName()  + "/leader") :
-              leader != null && isNodeLive(leader.getNodeName());
-        } catch (KeeperException e) {
-          return false;
-        } catch (InterruptedException e) {
-          return false;
-        }
-        if (leader != null && leader.getState() == Replica.State.ACTIVE && valid) {
-          returnLeader.set(leader);
-          return true;
+        if (leader != null && leader.getState() == Replica.State.ACTIVE) {
+          if (isNodeLive(leader.getNodeName())) {
+            returnLeader.set(leader);
+            return true;
+          }
+
+          if (!mustBeLive) {
+            if (zkLeader == null) {
+              zkLeader = getLeaderProps(collection, shard);
+            }
+            if (zkLeader != null && zkLeader.getName().equals(leader.getName())) {
+              returnLeader.set(leader);
+              return true;
+            }
+          }
         }
         Collection<Replica> replicas = slice.getReplicas();
         for (Replica replica : replicas) {
-          if ("true".equals(replica.getProperty(LEADER_PROP)) && replica.getState() == Replica.State.ACTIVE && valid) {
-            returnLeader.set(replica);
-            return true;
+          if ("true".equals(replica.getProperty(LEADER_PROP)) && replica.getState() == Replica.State.ACTIVE) {
+            if (isNodeLive(replica.getNodeName())) {
+              returnLeader.set(replica);
+              return true;
+            }
+            if (!mustBeLive) {
+              if (zkLeader == null) {
+                zkLeader = getLeaderProps(collection, shard);
+              }
+              if (zkLeader != null && zkLeader.getName().equals(replica.getName())) {
+                returnLeader.set(replica);
+                return true;
+              }
+            }
           }
         }
 
@@ -1035,6 +1022,25 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
     return returnLeader.get();
   }
 
+  public Replica getLeaderProps(final String collection, final String slice) {
+
+    try {
+      byte[] data = zkClient.getData(ZkStateReader.getShardLeadersPath(collection, slice), null, null);
+      ZkCoreNodeProps leaderProps = new ZkCoreNodeProps(ZkNodeProps.load(data));
+      String name = leaderProps.getNodeProps().getStr(ZkStateReader.CORE_NAME_PROP);
+      leaderProps.getNodeProps().getProperties().remove(ZkStateReader.CORE_NAME_PROP);
+      // nocommit - right key for leader name?
+      return new Replica(name, leaderProps.getNodeProps().getProperties(), collection, slice, this);
+
+    } catch (KeeperException.NoNodeException e) {
+      return null;
+    } catch (Exception e) {
+      SolrZkClient.checkInterrupted(e);
+      throw new SolrException(ErrorCode.SERVER_ERROR, e);
+    }
+
+  }
+
   /**
    * Get path where shard leader properties live in zookeeper.
    */
@@ -1594,6 +1600,8 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
           work.collect("", () -> {
             try {
               zk.removeWatches(getCollectionSCNPath(coll), this, WatcherType.Any, true);
+            } catch (KeeperException.NoWatcherException e) {
+
             } catch (Exception e) {
               log.info("could not remove watch {} {}", e.getClass().getSimpleName(), e.getMessage());
             }
@@ -1602,6 +1610,8 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
             work.collect("", () -> {
               try {
                 zk.removeWatches(getCollectionStateUpdatesPath(coll), watcher, WatcherType.Any, true);
+              } catch (KeeperException.NoWatcherException e) {
+
               } catch (Exception e) {
                 log.info("could not remove watch {} {}", e.getClass().getSimpleName(), e.getMessage());
               }
diff --git a/solr/solrj/src/java/org/apache/solr/common/util/SysStats.java b/solr/solrj/src/java/org/apache/solr/common/util/SysStats.java
index 05417d2..75a7a17 100644
--- a/solr/solrj/src/java/org/apache/solr/common/util/SysStats.java
+++ b/solr/solrj/src/java/org/apache/solr/common/util/SysStats.java
@@ -19,8 +19,8 @@ public class SysStats extends Thread {
     private static final Logger log = LoggerFactory
         .getLogger(MethodHandles.lookup().lookupClass());
 
-    public static final double OUR_LOAD_HIGH = 1.0;
-    public static final long REFRESH_INTERVAL = TimeUnit.NANOSECONDS.convert(5000, TimeUnit.MILLISECONDS);
+    public static final double OUR_LOAD_HIGH = 3.0;
+    public static final long REFRESH_INTERVAL = TimeUnit.NANOSECONDS.convert(2500, TimeUnit.MILLISECONDS);
     public static final int PROC_COUNT = ManagementFactory.getOperatingSystemMXBean().getAvailableProcessors();
     private final long refreshIntervalMs;
 
@@ -89,12 +89,12 @@ public class SysStats extends Thread {
                     threadTime.setLast(threadBean.getThreadCpuTime(threadTime.getId()));
                 }
 
-                double load =  ManagementFactory.getOperatingSystemMXBean().getSystemLoadAverage();
-                if (load < 0) {
-                    log.warn("SystemLoadAverage not supported on this JVM");
-                } else {
-                    sysLoad = load / (double) PROC_COUNT;
-                }
+//                double load =  ManagementFactory.getOperatingSystemMXBean().getSystemLoadAverage();
+//                if (load < 0) {
+//                    log.warn("SystemLoadAverage not supported on this JVM");
+//                } else {
+//                    sysLoad = load / (double) PROC_COUNT;
+//                }
 
             } else {
                 double load =  ManagementFactory.getOperatingSystemMXBean().getSystemLoadAverage();
@@ -162,6 +162,13 @@ public class SysStats extends Thread {
     }
 
     public double getSystemLoad() {
+        double sysLoad = -1;
+        double load = ManagementFactory.getOperatingSystemMXBean().getSystemLoadAverage();
+        if (load < 0) {
+            log.warn("SystemLoadAverage not supported on this JVM");
+        } else {
+            sysLoad = load / (double) PROC_COUNT;
+        }
         return sysLoad;
     }
 
diff --git a/solr/solrj/src/java/org/apache/zookeeper/ZooKeeperExposed.java b/solr/solrj/src/java/org/apache/zookeeper/ZooKeeperExposed.java
index 01d144c..e890259 100644
--- a/solr/solrj/src/java/org/apache/zookeeper/ZooKeeperExposed.java
+++ b/solr/solrj/src/java/org/apache/zookeeper/ZooKeeperExposed.java
@@ -64,7 +64,7 @@ public class ZooKeeperExposed {
         clientCnxn.sendThread.close();
 
         try {
-            clientCnxn.sendThread.join(20);
+            clientCnxn.sendThread.join(50);
         } catch (InterruptedException e) {
         }