You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2020/11/16 17:10:40 UTC

[lucene-solr] 02/03: @1229 Make sure recovery can bail from its longer retries.

This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 607c28a0b6c865a5d46de3d5cec4877edf78f298
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Mon Nov 16 10:16:37 2020 -0600

    @1229 Make sure recovery can bail from its longer retries.
---
 .../org/apache/solr/cloud/RecoveryStrategy.java    | 35 ++++++++++++++--------
 .../java/org/apache/solr/cloud/ZkController.java   |  9 +++++-
 .../org/apache/solr/cloud/ReplaceNodeTest.java     | 10 +++----
 3 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index 4c20928..d3ca1b6 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -36,6 +36,7 @@ import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.params.UpdateParams;
 import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.TimeSource;
 import org.apache.solr.core.CoreContainer;
 import org.apache.solr.core.CoreDescriptor;
 import org.apache.solr.core.DirectoryFactory.DirContext;
@@ -53,6 +54,7 @@ import org.apache.solr.update.UpdateLog.RecoveryInfo;
 import org.apache.solr.update.processor.DistributedUpdateProcessor;
 import org.apache.solr.util.RefCounted;
 import org.apache.solr.util.SolrPluginUtils;
+import org.apache.solr.util.TimeOut;
 import org.apache.solr.util.plugin.NamedListInitializedPlugin;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -771,24 +773,33 @@ public class RecoveryStrategy implements Runnable, Closeable {
           // Since we sleep at 2 seconds sub-intervals in
           // order to check if we were closed, 30 is chosen as the maximum loopCount (2s * 30 = 1m).
 
+          if (isClosed()) {
+            log.info("RecoveryStrategy has been closed");
+            return;
+          }
 
+          long wait = startingRecoveryDelayMilliSeconds;
 
-            if (isClosed()) {
-              log.info("RecoveryStrategy has been closed");
-              return;
-            }
+          if (retries.get() > 1 && retries.get() < 10) {
+            wait = (Math.max(500, startingRecoveryDelayMilliSeconds)) * retries.get();
+          } else if (retries.get() > 0) {
+            wait = TimeUnit.SECONDS.toMillis(60);
+          }
 
-            long wait = startingRecoveryDelayMilliSeconds;
+          log.info("Wait [{}] ms before trying to recover again (attempt={})", wait, retries);
 
-            if (retries.get() > 1 && retries.get() < 10) {
-              wait = (Math.max(500, startingRecoveryDelayMilliSeconds)) * retries.get();
-            } else if (retries.get() > 0) {
-              wait = TimeUnit.SECONDS.toMillis(60);
+          if (wait > 1000) {
+            TimeOut timeout = new TimeOut(wait, TimeUnit.MILLISECONDS, TimeSource.NANO_TIME);
+            while (!timeout.hasTimedOut()) {
+              if (isClosed()) {
+                log.info("RecoveryStrategy has been closed");
+                return;
+              }
+              Thread.sleep(1000);
             }
-
-            log.info("Wait [{}] seconds before trying to recover again (attempt={})", wait, retries);
-
+          } else {
             Thread.sleep(wait);
+          }
 
         } catch (InterruptedException e) {
           ParWork.propagateInterrupt(e, true);
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 0fef5a2..d63918e 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -1103,7 +1103,14 @@ public class ZkController implements Closeable, Runnable {
         });
         if (log.isDebugEnabled()) log.debug("get cluster lock");
         if (!lock.lock()) {
-          lockWaitLatch.await();
+          boolean success = false;
+          while (!success) {
+            if (isClosed()) {
+              log.warn("Closed, not getting cluster lock");
+              return;
+            }
+            success = lockWaitLatch.await(500, TimeUnit.MILLISECONDS);
+          }
         }
         try {
 
diff --git a/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java b/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java
index 2bdea8f..937ea21 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java
@@ -171,11 +171,11 @@ public class ReplaceNodeTest extends SolrCloudTestCase {
         assertFalse(r.getName().endsWith("_n1")); // make sure node was replaced
       }
     }
-    try {
-      CollectionAdminRequest.deleteCollection(coll).process(cluster.getSolrClient());
-    } catch (BaseHttpSolrClient.RemoteSolrException e) {
-      // nocommit fails with Error from server at null: Cannot unload non-existent core [replacenodetest_coll_shard4_replica_n27]}
-    }
+//    try {
+//      CollectionAdminRequest.deleteCollection(coll).process(cluster.getSolrClient());
+//    } catch (BaseHttpSolrClient.RemoteSolrException e) {
+//      // nocommit fails with Error from server at null: Cannot unload non-existent core [replacenodetest_coll_shard4_replica_n27]}
+//    }
   }
 
   public static  CollectionAdminRequest.AsyncCollectionAdminRequest createReplaceNodeRequest(String sourceNode, String targetNode, Boolean parallel) {