You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2020/11/16 17:10:40 UTC
[lucene-solr] 02/03: @1229 Make sure recovery can bail from its
longer retries.
This is an automated email from the ASF dual-hosted git repository.
markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
commit 607c28a0b6c865a5d46de3d5cec4877edf78f298
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Mon Nov 16 10:16:37 2020 -0600
@1229 Make sure recovery can bail from its longer retries.
---
.../org/apache/solr/cloud/RecoveryStrategy.java | 35 ++++++++++++++--------
.../java/org/apache/solr/cloud/ZkController.java | 9 +++++-
.../org/apache/solr/cloud/ReplaceNodeTest.java | 10 +++----
3 files changed, 36 insertions(+), 18 deletions(-)
diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index 4c20928..d3ca1b6 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -36,6 +36,7 @@ import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.TimeSource;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.core.DirectoryFactory.DirContext;
@@ -53,6 +54,7 @@ import org.apache.solr.update.UpdateLog.RecoveryInfo;
import org.apache.solr.update.processor.DistributedUpdateProcessor;
import org.apache.solr.util.RefCounted;
import org.apache.solr.util.SolrPluginUtils;
+import org.apache.solr.util.TimeOut;
import org.apache.solr.util.plugin.NamedListInitializedPlugin;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -771,24 +773,33 @@ public class RecoveryStrategy implements Runnable, Closeable {
// Since we sleep at 2 seconds sub-intervals in
// order to check if we were closed, 30 is chosen as the maximum loopCount (2s * 30 = 1m).
+ if (isClosed()) {
+ log.info("RecoveryStrategy has been closed");
+ return;
+ }
+ long wait = startingRecoveryDelayMilliSeconds;
- if (isClosed()) {
- log.info("RecoveryStrategy has been closed");
- return;
- }
+ if (retries.get() > 1 && retries.get() < 10) {
+ wait = (Math.max(500, startingRecoveryDelayMilliSeconds)) * retries.get();
+ } else if (retries.get() > 0) {
+ wait = TimeUnit.SECONDS.toMillis(60);
+ }
- long wait = startingRecoveryDelayMilliSeconds;
+ log.info("Wait [{}] ms before trying to recover again (attempt={})", wait, retries);
- if (retries.get() > 1 && retries.get() < 10) {
- wait = (Math.max(500, startingRecoveryDelayMilliSeconds)) * retries.get();
- } else if (retries.get() > 0) {
- wait = TimeUnit.SECONDS.toMillis(60);
+ if (wait > 1000) {
+ TimeOut timeout = new TimeOut(wait, TimeUnit.MILLISECONDS, TimeSource.NANO_TIME);
+ while (!timeout.hasTimedOut()) {
+ if (isClosed()) {
+ log.info("RecoveryStrategy has been closed");
+ return;
+ }
+ Thread.sleep(1000);
}
-
- log.info("Wait [{}] seconds before trying to recover again (attempt={})", wait, retries);
-
+ } else {
Thread.sleep(wait);
+ }
} catch (InterruptedException e) {
ParWork.propagateInterrupt(e, true);
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 0fef5a2..d63918e 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -1103,7 +1103,14 @@ public class ZkController implements Closeable, Runnable {
});
if (log.isDebugEnabled()) log.debug("get cluster lock");
if (!lock.lock()) {
- lockWaitLatch.await();
+ boolean success = false;
+ while (!success) {
+ if (isClosed()) {
+ log.warn("Closed, not getting cluster lock");
+ return;
+ }
+ success = lockWaitLatch.await(500, TimeUnit.MILLISECONDS);
+ }
}
try {
diff --git a/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java b/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java
index 2bdea8f..937ea21 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java
@@ -171,11 +171,11 @@ public class ReplaceNodeTest extends SolrCloudTestCase {
assertFalse(r.getName().endsWith("_n1")); // make sure node was replaced
}
}
- try {
- CollectionAdminRequest.deleteCollection(coll).process(cluster.getSolrClient());
- } catch (BaseHttpSolrClient.RemoteSolrException e) {
- // nocommit fails with Error from server at null: Cannot unload non-existent core [replacenodetest_coll_shard4_replica_n27]}
- }
+// try {
+// CollectionAdminRequest.deleteCollection(coll).process(cluster.getSolrClient());
+// } catch (BaseHttpSolrClient.RemoteSolrException e) {
+// // nocommit fails with Error from server at null: Cannot unload non-existent core [replacenodetest_coll_shard4_replica_n27]}
+// }
}
public static CollectionAdminRequest.AsyncCollectionAdminRequest createReplaceNodeRequest(String sourceNode, String targetNode, Boolean parallel) {