You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by ho...@apache.org on 2022/10/26 02:36:36 UTC

[solr] branch main updated: SOLR-16416: Register all handlers before doing overseer operations (#1129)

This is an automated email from the ASF dual-hosted git repository.

houston pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new 1738098b348 SOLR-16416: Register all handlers before doing overseer operations (#1129)
1738098b348 is described below

commit 1738098b348a27b3dd5c4f64e12115de20b74b17
Author: Houston Putman <ho...@apache.org>
AuthorDate: Tue Oct 25 22:36:31 2022 -0400

    SOLR-16416: Register all handlers before doing overseer operations (#1129)
    
    Also retry overseerPrioritizer ops on failure.
---
 solr/CHANGES.txt                                   |  2 ++
 .../apache/solr/cloud/OverseerNodePrioritizer.java | 35 +++++++++++++++++++---
 .../java/org/apache/solr/core/CoreContainer.java   | 29 ++++++++++--------
 3 files changed, 49 insertions(+), 17 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index d0f86191d9c..4c4631a1879 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -264,6 +264,8 @@ Bug Fixes
 
 * SOLR-16412: Race condition could trigger error on concurrent SizeLimitedDistributedMap cleanup (Patson Luk via noble)
 
+* SOLR-16416: OverseerNodePrioritizer now runs after all handlers are registered, and retries on failures. (Houston Putman)
+
 Other Changes
 ---------------------
 * SOLR-16351: Upgrade Carrot2 to 4.4.3, upgrade randomizedtesting to 2.8.0. (Dawid Weiss)
diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerNodePrioritizer.java b/solr/core/src/java/org/apache/solr/cloud/OverseerNodePrioritizer.java
index de65877b7de..63dca3f1c61 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerNodePrioritizer.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerNodePrioritizer.java
@@ -21,6 +21,7 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 import org.apache.solr.client.solrj.impl.ZkDistribStateManager;
+import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.SolrZkClient;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.CoreAdminParams;
@@ -117,11 +118,13 @@ public class OverseerNodePrioritizer {
     }
     if (!designateNodeId.equals(electionNodes.get(1))) { // checking if it is already at no:1
       log.info("asking node {} to come join election at head", designateNodeId);
-      invokeOverseerOp(designateNodeId, "rejoinAtHead"); // ask designate to come first
+      invokeOverseerOpWithRetries(
+          designateNodeId, "rejoinAtHead", 5); // ask designate to come first
       if (log.isInfoEnabled()) {
         log.info("asking the old first in line {} to rejoin election  ", electionNodes.get(1));
       }
-      invokeOverseerOp(electionNodes.get(1), "rejoin"); // ask second inline to go behind
+      invokeOverseerOpWithRetries(
+          electionNodes.get(1), "rejoin", 5); // ask second inline to go behind
       if (log.isInfoEnabled()) {
         List<String> newElectionNodes =
             OverseerTaskProcessor.getSortedElectionNodes(
@@ -133,6 +136,28 @@ public class OverseerNodePrioritizer {
     overseer.sendQuitToOverseer(OverseerTaskProcessor.getLeaderId(zkStateReader.getZkClient()));
   }
 
+  private void invokeOverseerOpWithRetries(String electionNode, String op, int retryCount) {
+    boolean successful = false; // set once invokeOverseerOp returns without throwing
+    for (int i = 0; i < retryCount && !successful; i++) { // stops after the first success
+      try {
+        invokeOverseerOp(electionNode, op);
+        successful = true;
+      } catch (SolrException e) { // invokeOverseerOp throws this on an error response
+        if (i < retryCount - 1) { // attempts remain: log, pause, then loop again
+          log.warn("Exception occurred while invoking Overseer Operation '{}'. Retrying.", op, e);
+          try {
+            Thread.sleep(100); // fixed 100 ms pause before the next attempt
+          } catch (InterruptedException ex) {
+            Thread.currentThread().interrupt(); // restore the interrupt flag
+            break; // NOTE(review): exits without rethrowing e; caller sees a normal return
+          }
+        } else { // final attempt failed: propagate the last failure to the caller
+          throw e;
+        }
+      }
+    }
+  }
+
   private void invokeOverseerOp(String electionNode, String op) {
     ModifiableSolrParams params = new ModifiableSolrParams();
     ShardHandler shardHandler = shardHandlerFactory.getShardHandler();
@@ -149,8 +174,10 @@ public class OverseerNodePrioritizer {
     shardHandler.submit(sreq, replica, sreq.params);
     ShardResponse response = shardHandler.takeCompletedOrError();
     if (response.getException() != null) {
-      log.error(
-          "Exception occurred while invoking Overseer Operation: {}", op, response.getException());
+      throw new SolrException(
+          SolrException.ErrorCode.SERVER_ERROR,
+          "Exception occurred while invoking Overseer Operation: " + op,
+          response.getException());
     }
   }
 }
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index 7fa16410e93..ec5f4fc3002 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -1055,19 +1055,6 @@ public class CoreContainer {
                   clusterSingletons.getSingletons().put(singleton.getName(), singleton);
                 }
               });
-
-      clusterSingletons.setReady();
-      if (NodeRoles.MODE_PREFERRED.equals(nodeRoles.getRoleMode(NodeRoles.Role.OVERSEER))) {
-        try {
-          log.info("This node has been started as a preferred overseer");
-          zkSys.getZkController().setPreferredOverseer();
-        } catch (KeeperException | InterruptedException e) {
-          throw new SolrException(ErrorCode.SERVER_ERROR, e);
-        }
-      }
-      if (!distributedCollectionCommandRunner.isPresent()) {
-        zkSys.getZkController().checkOverseerDesignate();
-      }
     }
 
     final CoreContainer thisCCRef = this;
@@ -1085,6 +1072,22 @@ public class CoreContainer {
             });
     jerseyAppHandler = new ApplicationHandler(containerHandlers.getJerseyEndpoints());
 
+    // Do Node setup logic after all handlers have been registered.
+    if (isZooKeeperAware()) {
+      clusterSingletons.setReady();
+      if (NodeRoles.MODE_PREFERRED.equals(nodeRoles.getRoleMode(NodeRoles.Role.OVERSEER))) {
+        try {
+          log.info("This node has been started as a preferred overseer");
+          zkSys.getZkController().setPreferredOverseer();
+        } catch (KeeperException | InterruptedException e) {
+          throw new SolrException(ErrorCode.SERVER_ERROR, e);
+        }
+      }
+      if (!distributedCollectionCommandRunner.isPresent()) {
+        zkSys.getZkController().checkOverseerDesignate();
+      }
+    }
+
     // This is a bit redundant but these are two distinct concepts for all they're accomplished at
     // the same time.
     status |= LOAD_COMPLETE | INITIAL_CORE_LOAD_COMPLETE;