You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2020/11/16 17:10:38 UTC

[lucene-solr] branch reference_impl_dev updated (a96f7e6 -> 61105ed)

This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a change to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git.


    from a96f7e6  @1227 Tweak core state locking.
     new 02e3a6e  @1228 Tweaks.
     new 607c28a  @1229 Make sure recovery can bail from its longer retries.
     new 61105ed  @1230 Fast response to close.

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../client/solrj/embedded/JettySolrRunner.java     |   2 +-
 .../org/apache/solr/cloud/RecoveryStrategy.java    |  39 +-
 .../java/org/apache/solr/cloud/ZkController.java   | 402 +++++++++++----------
 .../apache/solr/cloud/overseer/ZkStateWriter.java  |   2 +-
 .../java/org/apache/solr/handler/IndexFetcher.java |  11 +-
 .../solr/handler/admin/ShowFileRequestHandler.java |   2 +-
 .../solr/handler/component/CloudReplicaSource.java |   3 +
 .../processor/DistributedZkUpdateProcessor.java    |  13 +-
 .../apache/solr/cloud/BasicDistributedZkTest.java  |   2 +-
 .../solr/cloud/LeaderElectionIntegrationTest.java  |   3 +-
 .../org/apache/solr/cloud/ReplaceNodeTest.java     |  10 +-
 .../apache/solr/common/cloud/ZkStateReader.java    |  12 +-
 .../src/java/org/apache/solr/SolrTestCase.java     |   5 +-
 .../solr/cloud/AbstractFullDistribZkTestBase.java  |   2 +-
 .../src/resources/logconf/log4j2-std-debug.xml     |   8 +-
 15 files changed, 292 insertions(+), 224 deletions(-)


[lucene-solr] 01/03: @1228 Tweaks.

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 02e3a6e587580cafb2f1d45b51186c8b4cb060f0
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Mon Nov 16 10:03:21 2020 -0600

    @1228 Tweaks.
---
 .../client/solrj/embedded/JettySolrRunner.java     |  2 +-
 .../org/apache/solr/cloud/RecoveryStrategy.java    | 26 +++++++++++++++-------
 .../java/org/apache/solr/cloud/ZkController.java   | 21 +++++++++++++----
 .../apache/solr/cloud/overseer/ZkStateWriter.java  |  2 +-
 .../solr/handler/admin/ShowFileRequestHandler.java |  2 +-
 .../src/java/org/apache/solr/SolrTestCase.java     |  5 ++++-
 .../src/resources/logconf/log4j2-std-debug.xml     |  8 +++----
 7 files changed, 46 insertions(+), 20 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java b/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java
index 35ce796..d0ca4ec 100644
--- a/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java
+++ b/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java
@@ -867,7 +867,7 @@ public class JettySolrRunner implements Closeable {
     @Override
     public void service(HttpServletRequest req, HttpServletResponse res)
         throws IOException {
-      res.sendError(404, "Can not find: " + req.getRequestURI());
+      res.sendError(404, "Can not find (404 from jetty Solr Runner): " + req.getRequestURI());
     }
   }
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index 92127a1..4c20928 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -110,7 +110,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
       .getInteger("solr.cloud.wait-for-updates-with-stale-state-pause", 0);
   private volatile int maxRetries = 500;
   private volatile int startingRecoveryDelayMilliSeconds = Integer
-          .getInteger("solr.cloud.starting-recovery-delay-milli-seconds", 1000);
+          .getInteger("solr.cloud.starting-recovery-delay-milli-seconds", 0);
 
   public static interface RecoveryListener {
     public void recovered();
@@ -770,16 +770,26 @@ public class RecoveryStrategy implements Runnable, Closeable {
           // Wait an exponential interval between retries, start at 2 seconds and work up to a minute.
           // Since we sleep at 2 seconds sub-intervals in
           // order to check if we were closed, 30 is chosen as the maximum loopCount (2s * 30 = 1m).
-          double loopCount = Math.min(Math.pow(2, retries.get() - 1), 30);
-          log.info("Wait [{}] seconds before trying to recover again (attempt={})",
-              loopCount * startingRecoveryDelayMilliSeconds, retries);
-          for (int i = 0; i < loopCount; i++) {
+
+
+
             if (isClosed()) {
               log.info("RecoveryStrategy has been closed");
-              break; // check if someone closed us
+              return;
             }
-            Thread.sleep(startingRecoveryDelayMilliSeconds);
-          }
+
+            long wait = startingRecoveryDelayMilliSeconds;
+
+            if (retries.get() > 1 && retries.get() < 10) {
+              wait = (Math.max(500, startingRecoveryDelayMilliSeconds)) * retries.get();
+            } else if (retries.get() > 0) {
+              wait = TimeUnit.SECONDS.toMillis(60);
+            }
+
+            log.info("Wait [{}] seconds before trying to recover again (attempt={})", wait, retries);
+
+            Thread.sleep(wait);
+
         } catch (InterruptedException e) {
           ParWork.propagateInterrupt(e, true);
           return;
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 0cd1b65..0fef5a2 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -19,6 +19,7 @@ package org.apache.solr.cloud;
 import com.google.common.base.Strings;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.solr.client.solrj.cloud.DistributedLock;
+import org.apache.solr.client.solrj.cloud.LockListener;
 import org.apache.solr.client.solrj.cloud.SolrCloudManager;
 import org.apache.solr.client.solrj.impl.CloudHttp2SolrClient;
 import org.apache.solr.client.solrj.impl.SolrClientCloudManager;
@@ -1087,10 +1088,22 @@ public class ZkController implements Closeable, Runnable {
       }
       boolean createdClusterNodes = false;
       try {
-        DistributedLock lock = new DistributedLock(zkClient, "/cluster/cluster_lock", zkClient.getZkACLProvider().getACLsToAdd("/cluster/cluster_lock"));
+        CountDownLatch lockWaitLatch = new CountDownLatch(1);
+        DistributedLock lock = new DistributedLock(zkClient, "/cluster/cluster_lock", zkClient.getZkACLProvider().getACLsToAdd("/cluster/cluster_lock"),
+        new LockListener(){
+          @Override
+          public void lockAcquired() {
+            lockWaitLatch.countDown();
+          }
+
+          @Override
+          public void lockReleased() {
+
+          }
+        });
         if (log.isDebugEnabled()) log.debug("get cluster lock");
-        while (!lock.lock()) {
-          Thread.sleep(150);
+        if (!lock.lock()) {
+          lockWaitLatch.await();
         }
         try {
 
@@ -1169,7 +1182,7 @@ public class ZkController implements Closeable, Runnable {
 
         } finally {
           if (log.isDebugEnabled()) log.debug("release cluster lock");
-          if (lock.isOwner()) lock.unlock();
+          lock.unlock();
         }
         if (!createdClusterNodes) {
           // wait?
diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java
index b2ee8f4..54af3a7 100644
--- a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java
+++ b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java
@@ -75,7 +75,7 @@ public class ZkStateWriter {
   protected final ReentrantLock ourLock = new ReentrantLock(true);
   protected final ReentrantLock writeLock = new ReentrantLock(true);
 
-  private final ActionThrottle throttle = new ActionThrottle("ZkStateWriter", 50, new TimeSource.NanoTimeSource(){
+  private final ActionThrottle throttle = new ActionThrottle("ZkStateWriter", Integer.getInteger("solr.zkstatewriter.throttle", 50), new TimeSource.NanoTimeSource(){
     public void sleep(long ms) throws InterruptedException {
       ourLock.newCondition().await(ms, TimeUnit.MILLISECONDS);
     }
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/ShowFileRequestHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/ShowFileRequestHandler.java
index 193422c..8bf354f 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/ShowFileRequestHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/ShowFileRequestHandler.java
@@ -307,7 +307,7 @@ public class ShowFileRequestHandler extends RequestHandlerBase
     // Make sure the file exists, is readable and is not a hidden file
     if (!zkClient.exists(adminFile)) {
       log.error("Can not find: {}", adminFile);
-      rsp.setException(new SolrException(SolrException.ErrorCode.NOT_FOUND, "Can not find: "
+      rsp.setException(new SolrException(SolrException.ErrorCode.NOT_FOUND, "Can not find admin file: "
           + adminFile));
       return null;
     }
diff --git a/solr/test-framework/src/java/org/apache/solr/SolrTestCase.java b/solr/test-framework/src/java/org/apache/solr/SolrTestCase.java
index 43eaf44..0b14747 100644
--- a/solr/test-framework/src/java/org/apache/solr/SolrTestCase.java
+++ b/solr/test-framework/src/java/org/apache/solr/SolrTestCase.java
@@ -352,7 +352,10 @@ public class SolrTestCase extends LuceneTestCase {
       System.setProperty("socketTimeout", "15000");
       System.setProperty("connTimeout", "10000");
       System.setProperty("solr.cloud.wait-for-updates-with-stale-state-pause", "0");
-      System.setProperty("solr.cloud.starting-recovery-delay-milli-seconds", "500");
+      System.setProperty("solr.cloud.starting-recovery-delay-milli-seconds", "0");
+
+
+      System.setProperty("solr.zkstatewriter.throttle", "0");
 
       System.setProperty("solr.waitForState", "15"); // secs
 
diff --git a/solr/test-framework/src/resources/logconf/log4j2-std-debug.xml b/solr/test-framework/src/resources/logconf/log4j2-std-debug.xml
index 3f09f69..b4cbf94 100644
--- a/solr/test-framework/src/resources/logconf/log4j2-std-debug.xml
+++ b/solr/test-framework/src/resources/logconf/log4j2-std-debug.xml
@@ -57,7 +57,7 @@
 
         <AsyncRoot level="INFO">
             <AppenderRef ref="STDERR_COLOR"/>
-             <AppenderRef ref="FILE"/>
-        </AsyncRoot>
-    </Loggers>
-</Configuration>
+            <!--   <AppenderRef ref="FILE"/>-->
+         </AsyncRoot>
+     </Loggers>
+ </Configuration>


[lucene-solr] 03/03: @1230 Fast response to close.

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 61105ed14834c3c7d9ef7300d3d0a100b1e773a5
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Mon Nov 16 10:49:12 2020 -0600

    @1230 Fast response to close.
---
 .../java/org/apache/solr/cloud/ZkController.java   | 420 +++++++++++----------
 .../java/org/apache/solr/handler/IndexFetcher.java |  11 +-
 .../solr/handler/component/CloudReplicaSource.java |   3 +
 .../processor/DistributedZkUpdateProcessor.java    |  13 +-
 .../apache/solr/cloud/BasicDistributedZkTest.java  |   2 +-
 .../solr/cloud/LeaderElectionIntegrationTest.java  |   3 +-
 .../apache/solr/common/cloud/ZkStateReader.java    |  12 +-
 .../solr/cloud/AbstractFullDistribZkTestBase.java  |   2 +-
 8 files changed, 245 insertions(+), 221 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index d63918e..cdda8ef 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -300,6 +300,24 @@ public class ZkController implements Closeable, Runnable {
   // ref is held as a HashSet since we clone the set before notifying to avoid synchronizing too long
   private final Set<OnReconnect> reconnectListeners = ConcurrentHashMap.newKeySet();
 
+  private static class MyLockListener implements LockListener {
+    private final CountDownLatch lockWaitLatch;
+
+    public MyLockListener(CountDownLatch lockWaitLatch) {
+      this.lockWaitLatch = lockWaitLatch;
+    }
+
+    @Override
+    public void lockAcquired() {
+      lockWaitLatch.countDown();
+    }
+
+    @Override
+    public void lockReleased() {
+
+    }
+  }
+
   private class RegisterCoreAsync implements Callable<Object> {
 
     CoreDescriptor descriptor;
@@ -1030,99 +1048,83 @@ public class ZkController implements Closeable, Runnable {
 
   private void init() {
     // nocommit
-//    Runtime.getRuntime().addShutdownHook(new Thread() {
-//      public void run() {
-//        shutdown();
-//        ParWork.close(ParWork.getExecutor());
-//      }
-//
-//    });
-    synchronized (initLock) {
-      log.info("making shutdown watcher for cluster");
-      try {
-        zkClient.exists(CLUSTER_SHUTDOWN, new Watcher() {
-          @Override
-          public void process(WatchedEvent event) {
-            if (Event.EventType.None.equals(event.getType())) {
+    //    Runtime.getRuntime().addShutdownHook(new Thread() {
+    //      public void run() {
+    //        shutdown();
+    //        ParWork.close(ParWork.getExecutor());
+    //      }
+    //
+    //    });
+    //   synchronized (initLock) {
+    log.info("making shutdown watcher for cluster");
+    try {
+      zkClient.exists(CLUSTER_SHUTDOWN, new Watcher() {
+        @Override
+        public void process(WatchedEvent event) {
+          if (Event.EventType.None.equals(event.getType())) {
+            return;
+          }
+
+          log.info("Got even for shutdown {}" + event);
+          if (event.getType().equals(Event.EventType.NodeCreated)) {
+            log.info("Shutdown zk node created, shutting down");
+            shutdown();
+          } else {
+            log.info("Remaking shutdown watcher");
+            Stat stat = null;
+            try {
+              stat = zkClient.exists(CLUSTER_SHUTDOWN, this);
+            } catch (KeeperException e) {
+              SolrException.log(log, e);
+              return;
+            } catch (InterruptedException e) {
+              SolrException.log(log, e);
               return;
             }
-
-            log.info("Got even for shutdown {}" + event);
-            if (event.getType().equals(Event.EventType.NodeCreated)) {
-              log.info("Shutdown zk node created, shutting down");
+            if (stat != null) {
+              log.info("Got shutdown even while remaking watcher, shutting down");
               shutdown();
-            } else {
-              log.info("Remaking shutdown watcher");
-              Stat stat = null;
-              try {
-                stat = zkClient.exists(CLUSTER_SHUTDOWN, this);
-              } catch (KeeperException e) {
-                SolrException.log(log, e);
-                return;
-              } catch (InterruptedException e) {
-                SolrException.log(log, e);
-                return;
-              }
-              if (stat != null) {
-                log.info("Got shutdown even while remaking watcher, shutting down");
-                shutdown();
-              }
             }
           }
-        });
+        }
+      });
 
-      } catch (KeeperException e) {
-        log.error("Zk Exception", e);
-        return;
-      } catch (InterruptedException e) {
-        log.info("interrupted");
-        return;
-      }
-      try {
-        zkClient.mkdirs("/cluster/cluster_lock");
-      } catch (KeeperException.NodeExistsException e) {
-        // okay
-      } catch (KeeperException e) {
-        log.error("Zk Exception", e);
-        return;
-      }
-      boolean createdClusterNodes = false;
+    } catch (KeeperException e) {
+      log.error("Zk Exception", e);
+      return;
+    } catch (InterruptedException e) {
+      log.info("interrupted");
+      return;
+    }
+    try {
+      zkClient.mkdirs("/cluster/cluster_lock");
+    } catch (KeeperException.NodeExistsException e) {
+      // okay
+    } catch (KeeperException e) {
+      log.error("Zk Exception", e);
+      return;
+    }
+    boolean createdClusterNodes = false;
+    try {
+      CountDownLatch lockWaitLatch = new CountDownLatch(1);
+      boolean create = false;
+      DistributedLock lock = new DistributedLock(zkClient, "/cluster/cluster_lock", zkClient.getZkACLProvider().getACLsToAdd("/cluster/cluster_lock"), new MyLockListener(lockWaitLatch));
       try {
-        CountDownLatch lockWaitLatch = new CountDownLatch(1);
-        DistributedLock lock = new DistributedLock(zkClient, "/cluster/cluster_lock", zkClient.getZkACLProvider().getACLsToAdd("/cluster/cluster_lock"),
-        new LockListener(){
-          @Override
-          public void lockAcquired() {
-            lockWaitLatch.countDown();
-          }
-
-          @Override
-          public void lockReleased() {
-
-          }
-        });
-        if (log.isDebugEnabled()) log.debug("get cluster lock");
-        if (!lock.lock()) {
-          boolean success = false;
-          while (!success) {
-            if (isClosed()) {
-              log.warn("Closed, not getting cluster lock");
-              return;
-            }
-            success = lockWaitLatch.await(500, TimeUnit.MILLISECONDS);
-          }
+        //  if (log.isDebugEnabled()) log.debug("get cluster lock");
+        if (lock.lock()) {
+          create = true;
         }
-        try {
+        if (create) {
 
           if (log.isDebugEnabled()) log.debug("got cluster lock");
-          CountDownLatch latch = new CountDownLatch(1);
-          zkClient.getSolrZooKeeper().sync("/cluster/init", (rc, path, ctx) -> {
-            latch.countDown();
-          }, new Object());
-          boolean success = latch.await(10, TimeUnit.SECONDS);
-          if (!success) {
-            throw new SolrException(ErrorCode.SERVER_ERROR, "Timeout calling sync on collection zknode");
-          }
+          //          CountDownLatch latch = new CountDownLatch(1);
+          //          zkClient.getSolrZooKeeper().sync("/cluster/init", (rc, path, ctx) -> {
+          //            latch.countDown();
+          //          }, new Object());
+          //          boolean success = latch.await(10, TimeUnit.SECONDS);
+          //          if (!success) {
+          //            throw new SolrException(ErrorCode.SERVER_ERROR, "Timeout calling sync on collection zknode");
+          //          }
           if (!zkClient.exists("/cluster/init")) {
             try {
               createClusterZkNodes(zkClient);
@@ -1136,134 +1138,132 @@ public class ZkController implements Closeable, Runnable {
             if (log.isDebugEnabled()) log.debug("Cluster zk nodes already exist");
             int currentLiveNodes = zkClient.getChildren(ZkStateReader.LIVE_NODES_ZKNODE, null, true).size();
             if (log.isDebugEnabled()) log.debug("Current live nodes {}", currentLiveNodes);
-//          if (currentLiveNodes == 0) {
-//            log.info("Delete Overseer queues");
-//            // cluster is in a startup state, clear zk queues
-//            List<String> pathsToDelete = Arrays.asList(new String[]{Overseer.OVERSEER_QUEUE, Overseer.OVERSEER_QUEUE_WORK,
-//                    Overseer.OVERSEER_COLLECTION_QUEUE_WORK, Overseer.OVERSEER_COLLECTION_MAP_RUNNING,
-//                    Overseer.OVERSEER_COLLECTION_MAP_COMPLETED, Overseer.OVERSEER_COLLECTION_MAP_FAILURE, Overseer.OVERSEER_ASYNC_IDS});
-//            CountDownLatch latch = new CountDownLatch(pathsToDelete.size());
-//            int[] code = new int[1];
-//            String[] path = new String[1];
-//            boolean[] failed = new boolean[1];
-//
-//            for (String delPath : pathsToDelete) {
-//              zkClient.getSolrZooKeeper().delete(delPath, -1,
-//                      (resultCode, zkpath, context) -> {
-//                        code[0] = resultCode;
-//                        if (resultCode != 0) {
-//                          failed[0] = true;
-//                          path[0] = "" + zkpath;
-//                        }
-//
-//                        latch.countDown();
-//                      }, "");
-//            }
-//            boolean success = false;
-//            log.info("Wait for delete Overseer queues");
-//            try {
-//              success = latch.await(15, TimeUnit.SECONDS);
-//            } catch (InterruptedException e) {
-//              ParWork.propegateInterrupt(e);
-//
-//              throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
-//            }
-//
-//            // nocommit, still haackey, do fails right
-//            if (code[0] != 0) {
-//              System.out.println("fail code: "+ code[0]);
-//              KeeperException e = KeeperException.create(KeeperException.Code.get(code[0]), path[0]);
-//              if (e instanceof  NoNodeException) {
-//                // okay
-//              } else {
-//                throw e;
-//              }
-//
-//            }
-//
-//            if (!success) {
-//              throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Timeout waiting for operations to complete");
-//            }
-//          }
+            //          if (currentLiveNodes == 0) {
+            //            log.info("Delete Overseer queues");
+            //            // cluster is in a startup state, clear zk queues
+            //            List<String> pathsToDelete = Arrays.asList(new String[]{Overseer.OVERSEER_QUEUE, Overseer.OVERSEER_QUEUE_WORK,
+            //                    Overseer.OVERSEER_COLLECTION_QUEUE_WORK, Overseer.OVERSEER_COLLECTION_MAP_RUNNING,
+            //                    Overseer.OVERSEER_COLLECTION_MAP_COMPLETED, Overseer.OVERSEER_COLLECTION_MAP_FAILURE, Overseer.OVERSEER_ASYNC_IDS});
+            //            CountDownLatch latch = new CountDownLatch(pathsToDelete.size());
+            //            int[] code = new int[1];
+            //            String[] path = new String[1];
+            //            boolean[] failed = new boolean[1];
+            //
+            //            for (String delPath : pathsToDelete) {
+            //              zkClient.getSolrZooKeeper().delete(delPath, -1,
+            //                      (resultCode, zkpath, context) -> {
+            //                        code[0] = resultCode;
+            //                        if (resultCode != 0) {
+            //                          failed[0] = true;
+            //                          path[0] = "" + zkpath;
+            //                        }
+            //
+            //                        latch.countDown();
+            //                      }, "");
+            //            }
+            //            boolean success = false;
+            //            log.info("Wait for delete Overseer queues");
+            //            try {
+            //              success = latch.await(15, TimeUnit.SECONDS);
+            //            } catch (InterruptedException e) {
+            //              ParWork.propegateInterrupt(e);
+            //
+            //              throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+            //            }
+            //
+            //            // nocommit, still haackey, do fails right
+            //            if (code[0] != 0) {
+            //              System.out.println("fail code: "+ code[0]);
+            //              KeeperException e = KeeperException.create(KeeperException.Code.get(code[0]), path[0]);
+            //              if (e instanceof  NoNodeException) {
+            //                // okay
+            //              } else {
+            //                throw e;
+            //              }
+            //
+            //            }
+            //
+            //            if (!success) {
+            //              throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Timeout waiting for operations to complete");
+            //            }
+            //          }
           }
-
-        } finally {
-          if (log.isDebugEnabled()) log.debug("release cluster lock");
-          lock.unlock();
-        }
-        if (!createdClusterNodes) {
-          // wait?
         }
+      } finally {
+        if (log.isDebugEnabled()) log.debug("release cluster lock");
+        lock.unlock();
+        lock.close();
+      }
+
+      if (!createdClusterNodes) {
+        // wait?
+      }
 
-        zkStateReader = new ZkStateReader(zkClient, () -> {
-          if (cc != null) cc.securityNodeChanged();
+      zkStateReader = new ZkStateReader(zkClient, () -> {
+        if (cc != null) cc.securityNodeChanged();
+      });
+      this.baseURL = zkStateReader.getBaseUrlForNodeName(this.nodeName);
+
+      zkStateReader.createClusterStateWatchersAndUpdate();
+
+      this.overseer = new Overseer(cc.getUpdateShardHandler(), CommonParams.CORES_HANDLER_PATH, zkStateReader, this, cloudConfig);
+      this.overseerRunningMap = Overseer.getRunningMap(zkClient);
+      this.overseerCompletedMap = Overseer.getCompletedMap(zkClient);
+      this.overseerFailureMap = Overseer.getFailureMap(zkClient);
+      this.asyncIdsMap = Overseer.getAsyncIdsMap(zkClient);
+      this.overseerJobQueue = overseer.getStateUpdateQueue();
+      this.overseerCollectionQueue = overseer.getCollectionQueue(zkClient);
+      this.overseerConfigSetQueue = overseer.getConfigSetQueue(zkClient);
+      this.sysPropsCacher = new NodesSysPropsCacher(getSolrCloudManager().getNodeStateProvider(), getNodeName(), zkStateReader);
+      overseerElector = new LeaderElector(this, new ContextKey("overseer", "overseer"), overseerContexts);
+      try (ParWork worker = new ParWork(this, false, true)) {
+        // start the overseer first as following code may need it's processing
+        worker.collect("startOverseer", () -> {
+          ElectionContext context = getOverseerContext();
+          log.info("Overseer setting up context {}", context.leaderProps);
+          overseerElector.setup(context);
+          try {
+            log.info("Overseer joining election {}", context.leaderProps);
+            overseerElector.joinElection(context, false);
+          } catch (KeeperException e) {
+            throw new SolrException(ErrorCode.SERVER_ERROR, e);
+          } catch (InterruptedException e) {
+            ParWork.propagateInterrupt(e);
+            throw new SolrException(ErrorCode.SERVER_ERROR, e);
+          } catch (IOException e) {
+            throw new SolrException(ErrorCode.SERVER_ERROR, e);
+          }
         });
-        this.baseURL = zkStateReader.getBaseUrlForNodeName(this.nodeName);
-
-        zkStateReader.createClusterStateWatchersAndUpdate();
-
-        this.overseer = new Overseer(cc.getUpdateShardHandler(),
-                CommonParams.CORES_HANDLER_PATH, zkStateReader, this, cloudConfig);
-        this.overseerRunningMap = Overseer.getRunningMap(zkClient);
-        this.overseerCompletedMap = Overseer.getCompletedMap(zkClient);
-        this.overseerFailureMap = Overseer.getFailureMap(zkClient);
-        this.asyncIdsMap = Overseer.getAsyncIdsMap(zkClient);
-        this.overseerJobQueue = overseer.getStateUpdateQueue();
-        this.overseerCollectionQueue = overseer.getCollectionQueue(zkClient);
-        this.overseerConfigSetQueue = overseer.getConfigSetQueue(zkClient);
-        this.sysPropsCacher = new NodesSysPropsCacher(getSolrCloudManager().getNodeStateProvider(),
-                getNodeName(), zkStateReader);
-        overseerElector = new LeaderElector(this, new ContextKey("overseer", "overseer"), overseerContexts);
-        try (ParWork worker = new ParWork(this, false, true)) {
-          // start the overseer first as following code may need it's processing
-          worker.collect("startOverseer", () -> {
-            ElectionContext context = getOverseerContext();
-            log.info("Overseer setting up context {}", context.leaderProps);
-            overseerElector.setup(context);
-            try {
-              log.info("Overseer joining election {}", context.leaderProps);
-              overseerElector.joinElection(context, false);
-            } catch (KeeperException e) {
-              throw new SolrException(ErrorCode.SERVER_ERROR, e);
-            } catch (InterruptedException e) {
-              ParWork.propagateInterrupt(e);
-              throw new SolrException(ErrorCode.SERVER_ERROR, e);
-            } catch (IOException e) {
-              throw new SolrException(ErrorCode.SERVER_ERROR, e);
-            }
-          });
 
-//          worker.collect("publishDownState", () -> {
-//            try {
-//              Stat stat = zkClient.exists(ZkStateReader.LIVE_NODES_ZKNODE, null);
-//              if (stat != null && stat.getNumChildren() > 0) {
-//                publishDownStates();
-//              }
-//            } catch (InterruptedException e) {
-//              ParWork.propagateInterrupt(e);
-//              throw new SolrException(ErrorCode.SERVER_ERROR, e);
-//            } catch (KeeperException e) {
-//              throw new SolrException(ErrorCode.SERVER_ERROR, e);
-//            }
-//          });
-        }
-        statePublisher = new StatePublisher(overseerJobQueue);
-        statePublisher.start();
+        //          worker.collect("publishDownState", () -> {
+        //            try {
+        //              Stat stat = zkClient.exists(ZkStateReader.LIVE_NODES_ZKNODE, null);
+        //              if (stat != null && stat.getNumChildren() > 0) {
+        //                publishDownStates();
+        //              }
+        //            } catch (InterruptedException e) {
+        //              ParWork.propagateInterrupt(e);
+        //              throw new SolrException(ErrorCode.SERVER_ERROR, e);
+        //            } catch (KeeperException e) {
+        //              throw new SolrException(ErrorCode.SERVER_ERROR, e);
+        //            }
+        //          });
+      }
+      statePublisher = new StatePublisher(overseerJobQueue);
+      statePublisher.start();
 
-        // Do this last to signal we're up.
-        createEphemeralLiveNode();
+      // Do this last to signal we're up.
+      createEphemeralLiveNode();
 
-        //  publishAndWaitForDownStates();
-      } catch (InterruptedException e) {
-        ParWork.propagateInterrupt(e);
-        throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR,
-                "", e);
-      } catch (KeeperException e) {
-        log.error("", e);
-        throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR,
-                "", e);
-      }
+      //  publishAndWaitForDownStates();
+    } catch (InterruptedException e) {
+      ParWork.propagateInterrupt(e);
+      throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
+    } catch (KeeperException e) {
+      log.error("", e);
+      throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
     }
+    //  }
   }
 
   private synchronized void shutdown() {
@@ -1467,7 +1467,19 @@ public class ZkController implements Closeable, Runnable {
       }
 
       log.info("Wait to see leader for {}, {}", collection, shardId);
-      Replica leader = zkStateReader.getLeaderRetry(collection, shardId, 15000);
+      Replica leader = null;
+      for (int i = 0; i < 15; i++) {
+        try {
+          if (isClosed()) {
+            throw new AlreadyClosedException();
+          }
+
+          leader = zkStateReader.getLeaderRetry(collection, shardId, 3000);
+          break;
+        } catch (TimeoutException timeoutException) {
+
+        }
+      }
 
       String ourUrl = replica.getCoreUrl();
       boolean isLeader = leader.getName() .equals(coreName);
diff --git a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
index f3871a1..2ffa3e7 100644
--- a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
+++ b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
@@ -52,6 +52,7 @@ import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.concurrent.atomic.LongAdder;
 import java.util.function.BooleanSupplier;
@@ -375,7 +376,13 @@ public class IndexFetcher {
     try {
       if (fetchFromLeader) {
         assert !solrCore.isClosed(): "Replication should be stopped before closing the core";
-        Replica replica = getLeaderReplica();
+        Replica replica = null;
+        try {
+          replica = getLeaderReplica();
+        } catch (TimeoutException e) {
+          log.warn("Leader is not available. Index fetch failed due to not finding leader: {}", masterUrl, e);
+          return new IndexFetchResult(IndexFetchResult.FAILED_BY_EXCEPTION_MESSAGE, false, e);
+        }
         CloudDescriptor cd = solrCore.getCoreDescriptor().getCloudDescriptor();
         if (solrCore.getCoreDescriptor().getName().equals(replica.getName())) {
           return IndexFetchResult.EXPECTING_NON_LEADER;
@@ -680,7 +687,7 @@ public class IndexFetcher {
     }
   }
 
-  private Replica getLeaderReplica() throws InterruptedException {
+  private Replica getLeaderReplica() throws InterruptedException, TimeoutException {
     ZkController zkController = solrCore.getCoreContainer().getZkController();
     CloudDescriptor cd = solrCore.getCoreDescriptor().getCloudDescriptor();
     Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry(
diff --git a/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java b/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java
index 644bb53..53270cb 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java
@@ -24,6 +24,7 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.concurrent.TimeoutException;
 import java.util.function.Predicate;
 import java.util.stream.Collectors;
 
@@ -228,6 +229,8 @@ class CloudReplicaSource implements ReplicaSource {
                 sliceName, collectionName, clusterState.getCollectionOrNull(collectionName));
           }
           throw e;
+        } catch (TimeoutException e) {
+          e.printStackTrace();
         }
       }
       return replica.getName().equals(shardLeader.getName());
diff --git a/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java
index 6ea7e2d..1d38357 100644
--- a/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java
+++ b/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java
@@ -29,6 +29,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 import java.util.concurrent.locks.ReentrantLock;
 
 import org.apache.solr.client.solrj.request.UpdateRequest;
@@ -190,7 +191,7 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
 
         try {
           leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, cloudDesc.getShardId(), 1000);
-        } catch (InterruptedException e) {
+        } catch (Exception e) {
           ParWork.propagateInterrupt(e);
           throw new SolrException(ErrorCode.SERVER_ERROR,
               "Exception finding leader for shard " + cloudDesc.getShardId(), e);
@@ -563,7 +564,7 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
           cmdDistrib.distribDelete(cmd, myReplicas, params, false, rollupReplicationTracker, leaderReplicationTracker);
         }
       }
-    } catch (InterruptedException e) {
+    } catch (InterruptedException | TimeoutException e) {
       ParWork.propagateInterrupt(e);
       throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "Interrupted", e);
     }
@@ -609,7 +610,7 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
           nodes.add(new SolrCmdDistributor.StdNode(zkController.getZkStateReader(), props, collection, shardId));
         }
       }
-    } catch (InterruptedException e) {
+    } catch (Exception e) {
       ParWork.propagateInterrupt(e);
       throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
     }
@@ -637,7 +638,7 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
     }
     try {
       return zkController.getZkStateReader().getLeaderRetry(collection, cloudDesc.getShardId()).getCoreUrl();
-    } catch (InterruptedException e) {
+    } catch (InterruptedException | TimeoutException e) {
       ParWork.propagateInterrupt(e);
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Exception during fetching from leader.", e);
     }
@@ -781,7 +782,7 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
             new SolrCmdDistributor.ForwardNode(zkController.getZkStateReader(), leaderReplica, collection, shardId));
       }
 
-    } catch (InterruptedException e) {
+    } catch (InterruptedException | TimeoutException e) {
       ParWork.propagateInterrupt(e);
       throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
     }
@@ -867,7 +868,7 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
   }
 
   /** For {@link org.apache.solr.common.params.CollectionParams.CollectionAction#SPLITSHARD} */
-  protected boolean amISubShardLeader(DocCollection coll, Slice parentSlice, String id, SolrInputDocument doc) throws InterruptedException {
+  protected boolean amISubShardLeader(DocCollection coll, Slice parentSlice, String id, SolrInputDocument doc) throws InterruptedException, TimeoutException {
     // Am I the leader of a shard in "construction/recovery" state?
     String myShardId = cloudDesc.getShardId();
     Slice mySlice = coll.getSlice(myShardId);
diff --git a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java
index 1ca6eb8..e667559 100644
--- a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java
@@ -1041,7 +1041,7 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
             if (leader.equals(zkStateReader.getLeaderUrl(oneInstanceCollection2, "shard1", 10000))) {
               return false;
             }
-          } catch (InterruptedException e) {
+          } catch (InterruptedException | TimeoutException e) {
             throw new RuntimeException(e);
           }
           return true;
diff --git a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java
index 7cbb7be..b4a15b8 100644
--- a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java
@@ -20,6 +20,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.LuceneTestCase.Slow;
@@ -142,7 +143,7 @@ public class LeaderElectionIntegrationTest extends SolrCloudTestCase {
     return null;
   }
 
-  private String getLeader(String collection) throws InterruptedException {
+  private String getLeader(String collection) throws InterruptedException, TimeoutException {
 
     ZkNodeProps props = cluster.getSolrClient().getZkStateReader().getLeaderRetry(collection, "shard1", 30000);
     String leader = props.getStr(ZkStateReader.NODE_NAME_PROP);
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
index 6da25ea..407833a 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
@@ -862,7 +862,7 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
     return closed;
   }
 
-  public String getLeaderUrl(String collection, String shard, int timeout) throws InterruptedException {
+  public String getLeaderUrl(String collection, String shard, int timeout) throws InterruptedException, TimeoutException {
     Replica replica = getLeaderRetry(collection, shard, timeout);
     return replica.getCoreUrl();
   }
@@ -898,14 +898,14 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
   /**
    * Get shard leader properties, with retry if none exist.
    */
-  public Replica getLeaderRetry(String collection, String shard) throws InterruptedException {
+  public Replica getLeaderRetry(String collection, String shard) throws InterruptedException, TimeoutException {
     return getLeaderRetry(collection, shard, GET_LEADER_RETRY_DEFAULT_TIMEOUT);
   }
 
   /**
    * Get shard leader properties, with retry if none exist.
    */
-  public Replica getLeaderRetry(String collection, String shard, int timeout) throws InterruptedException {
+  public Replica getLeaderRetry(String collection, String shard, int timeout) throws InterruptedException, TimeoutException {
 
     DocCollection coll = getClusterState().getCollectionOrNull(collection);
 
@@ -913,7 +913,7 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
       Slice slice = coll.getSlice(shard);
       if (slice != null) {
         Replica leader = slice.getLeader();
-        if (leader != null && leader.getState() == Replica.State.ACTIVE) {
+        if (leader != null && leader.getState() == Replica.State.ACTIVE && isNodeLive(leader.getNodeName())) {
           return leader;
         }
       }
@@ -934,7 +934,7 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
 
         Collection<Replica> replicas = slice.getReplicas();
         for (Replica replica : replicas) {
-          if (replica.get("leader") != null && replica.get("leader").equals("true") && replica.getState() == Replica.State.ACTIVE) {
+          if (replica.get("leader") != null && replica.get("leader").equals("true") && replica.getState() == Replica.State.ACTIVE && isNodeLive(replica.getNodeName())) {
             returnLeader.set(replica);
             return true;
           }
@@ -943,7 +943,7 @@ public class ZkStateReader implements SolrCloseable, Replica.NodeNameToBaseUrl {
         return false;
       });
     } catch (TimeoutException e) {
-      throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "No registered leader was found after waiting for "
+      throw new TimeoutException("No registered leader was found after waiting for "
           + timeout + "ms " + ", collection: " + collection + " slice: " + shard + " saw state=" + clusterState.getCollectionOrNull(collection)
           + " with live_nodes=" + clusterState.getLiveNodes());
     }
diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java
index adcf083..1bbdf8a 100644
--- a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java
+++ b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java
@@ -2122,7 +2122,7 @@ public abstract class AbstractFullDistribZkTestBase extends AbstractDistribZkTes
     log.info("Summary of the cluster: {}", builder);
   }
 
-  protected void waitForReplicationFromReplicas(String collectionName, ZkStateReader zkStateReader, TimeOut timeout) throws KeeperException, InterruptedException, IOException {
+  protected void waitForReplicationFromReplicas(String collectionName, ZkStateReader zkStateReader, TimeOut timeout) throws KeeperException, InterruptedException, IOException, TimeoutException {
     log.info("waitForReplicationFromReplicas: {}", collectionName);
     DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName);
     Map<String, CoreContainer> containers = new HashMap<>();


[lucene-solr] 02/03: @1229 Make sure recovery can bail from its longer retries.

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 607c28a0b6c865a5d46de3d5cec4877edf78f298
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Mon Nov 16 10:16:37 2020 -0600

    @1229 Make sure recovery can bail from its longer retries.
---
 .../org/apache/solr/cloud/RecoveryStrategy.java    | 35 ++++++++++++++--------
 .../java/org/apache/solr/cloud/ZkController.java   |  9 +++++-
 .../org/apache/solr/cloud/ReplaceNodeTest.java     | 10 +++----
 3 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index 4c20928..d3ca1b6 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -36,6 +36,7 @@ import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.params.UpdateParams;
 import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.TimeSource;
 import org.apache.solr.core.CoreContainer;
 import org.apache.solr.core.CoreDescriptor;
 import org.apache.solr.core.DirectoryFactory.DirContext;
@@ -53,6 +54,7 @@ import org.apache.solr.update.UpdateLog.RecoveryInfo;
 import org.apache.solr.update.processor.DistributedUpdateProcessor;
 import org.apache.solr.util.RefCounted;
 import org.apache.solr.util.SolrPluginUtils;
+import org.apache.solr.util.TimeOut;
 import org.apache.solr.util.plugin.NamedListInitializedPlugin;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -771,24 +773,33 @@ public class RecoveryStrategy implements Runnable, Closeable {
           // Since we sleep at 2 seconds sub-intervals in
           // order to check if we were closed, 30 is chosen as the maximum loopCount (2s * 30 = 1m).
 
+          if (isClosed()) {
+            log.info("RecoveryStrategy has been closed");
+            return;
+          }
 
+          long wait = startingRecoveryDelayMilliSeconds;
 
-            if (isClosed()) {
-              log.info("RecoveryStrategy has been closed");
-              return;
-            }
+          if (retries.get() > 1 && retries.get() < 10) {
+            wait = (Math.max(500, startingRecoveryDelayMilliSeconds)) * retries.get();
+          } else if (retries.get() > 0) {
+            wait = TimeUnit.SECONDS.toMillis(60);
+          }
 
-            long wait = startingRecoveryDelayMilliSeconds;
+          log.info("Wait [{}] ms before trying to recover again (attempt={})", wait, retries);
 
-            if (retries.get() > 1 && retries.get() < 10) {
-              wait = (Math.max(500, startingRecoveryDelayMilliSeconds)) * retries.get();
-            } else if (retries.get() > 0) {
-              wait = TimeUnit.SECONDS.toMillis(60);
+          if (wait > 1000) {
+            TimeOut timeout = new TimeOut(wait, TimeUnit.MILLISECONDS, TimeSource.NANO_TIME);
+            while (!timeout.hasTimedOut()) {
+              if (isClosed()) {
+                log.info("RecoveryStrategy has been closed");
+                return;
+              }
+              Thread.sleep(1000);
             }
-
-            log.info("Wait [{}] seconds before trying to recover again (attempt={})", wait, retries);
-
+          } else {
             Thread.sleep(wait);
+          }
 
         } catch (InterruptedException e) {
           ParWork.propagateInterrupt(e, true);
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 0fef5a2..d63918e 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -1103,7 +1103,14 @@ public class ZkController implements Closeable, Runnable {
         });
         if (log.isDebugEnabled()) log.debug("get cluster lock");
         if (!lock.lock()) {
-          lockWaitLatch.await();
+          boolean success = false;
+          while (!success) {
+            if (isClosed()) {
+              log.warn("Closed, not getting cluster lock");
+              return;
+            }
+            success = lockWaitLatch.await(500, TimeUnit.MILLISECONDS);
+          }
         }
         try {
 
diff --git a/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java b/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java
index 2bdea8f..937ea21 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java
@@ -171,11 +171,11 @@ public class ReplaceNodeTest extends SolrCloudTestCase {
         assertFalse(r.getName().endsWith("_n1")); // make sure node was replaced
       }
     }
-    try {
-      CollectionAdminRequest.deleteCollection(coll).process(cluster.getSolrClient());
-    } catch (BaseHttpSolrClient.RemoteSolrException e) {
-      // nocommit fails with Error from server at null: Cannot unload non-existent core [replacenodetest_coll_shard4_replica_n27]}
-    }
+//    try {
+//      CollectionAdminRequest.deleteCollection(coll).process(cluster.getSolrClient());
+//    } catch (BaseHttpSolrClient.RemoteSolrException e) {
+//      // nocommit fails with Error from server at null: Cannot unload non-existent core [replacenodetest_coll_shard4_replica_n27]}
+//    }
   }
 
   public static  CollectionAdminRequest.AsyncCollectionAdminRequest createReplaceNodeRequest(String sourceNode, String targetNode, Boolean parallel) {