You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by il...@apache.org on 2021/02/27 23:53:58 UTC

[lucene-solr] branch master updated: SOLR-14928: add exponential backoff wait time when Compare And Swap fails in distributed cluster state update due to concurrent update (#2438)

This is an automated email from the ASF dual-hosted git repository.

ilan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 1fff174  SOLR-14928: add exponential backoff wait time when Compare And Swap fails in distributed cluster state update due to concurrent update (#2438)
1fff174 is described below

commit 1fff1746909361aaccab8b5c146026eba9fbeb88
Author: Ilan Ginzburg <il...@gmail.com>
AuthorDate: Sun Feb 28 00:53:42 2021 +0100

    SOLR-14928: add exponential backoff wait time when Compare And Swap fails in distributed cluster state update due to concurrent update (#2438)
---
 .../java/org/apache/solr/cloud/DistributedClusterStateUpdater.java | 7 +++++--
 .../apache/solr/cloud/api/collections/CollectionHandlingUtils.java | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/DistributedClusterStateUpdater.java b/solr/core/src/java/org/apache/solr/cloud/DistributedClusterStateUpdater.java
index f57cc31..e48b7ce 100644
--- a/solr/core/src/java/org/apache/solr/cloud/DistributedClusterStateUpdater.java
+++ b/solr/core/src/java/org/apache/solr/cloud/DistributedClusterStateUpdater.java
@@ -18,6 +18,7 @@
 package org.apache.solr.cloud;
 
 import org.apache.solr.client.solrj.cloud.SolrCloudManager;
+import org.apache.solr.cloud.api.collections.CollectionHandlingUtils;
 import org.apache.solr.cloud.overseer.*;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.*;
@@ -446,9 +447,11 @@ public class DistributedClusterStateUpdater {
         }
         // We've tried to update an existing state.json and got a BadVersionException. We'll try again a few times.
         // When only two threads compete, no point in waiting: if we lost this time we'll get it next time right away.
-        // But if more threads compete, then waiting a bit (random delay) can improve our chances. The delay should likely
-        // be proportional to the time between reading the cluster state and updating it. We can measure it in the loop above.
+        // But if more threads compete, then waiting a bit (random delay) can improve our chances. The delay should in
+        // theory grow as the number of concurrent threads attempting updates increases, but we don't know that number,
+        // so we use exponential backoff instead.
         // With "per replica states" collections, concurrent attempts of even just two threads are expected to be extremely rare.
+        Thread.sleep(CollectionHandlingUtils.RANDOM.nextInt(attempt < 13 ? 1 << attempt : 1 << 13)); // max wait 2^13ms=8.192 sec
       }
 
       // We made quite a few attempts but failed repeatedly. This is pretty bad but we can't loop trying forever.
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/CollectionHandlingUtils.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/CollectionHandlingUtils.java
index 439d7c7..7707566 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/CollectionHandlingUtils.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/CollectionHandlingUtils.java
@@ -111,7 +111,7 @@ public class CollectionHandlingUtils {
       DocCollection.PER_REPLICA_STATE, null,
       ZkStateReader.PULL_REPLICAS, "0"));
 
-  protected static final Random RANDOM;
+  public static final Random RANDOM;
   static {
     // We try to make things reproducible in the context of our tests by initializing the random instance
     // based on the current seed