You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by kr...@apache.org on 2023/09/29 18:02:10 UTC

[solr] branch main updated: SOLR-17004: ZkStateReader waitForState should check clusterState before using watchers (#1945)

This is an automated email from the ASF dual-hosted git repository.

krisden pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new 240ae14962a SOLR-17004: ZkStateReader waitForState should check clusterState before using watchers (#1945)
240ae14962a is described below

commit 240ae14962a62192fedaea48d07590dd15ff1891
Author: Kevin Risden <ri...@users.noreply.github.com>
AuthorDate: Fri Sep 29 14:02:04 2023 -0400

    SOLR-17004: ZkStateReader waitForState should check clusterState before using watchers (#1945)
---
 solr/CHANGES.txt                                   |  3 +-
 .../solr/cloud/LeaderElectionIntegrationTest.java  | 10 ++++++-
 .../apache/solr/common/cloud/ZkStateReader.java    | 33 ++++++++++++++++++----
 3 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index cc7d9a46059..df10be97fc9 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -21,7 +21,8 @@ Improvements
 
 Optimizations
 ---------------------
-(No changes)
+
+* SOLR-17004: ZkStateReader waitForState should check clusterState before using watchers (Kevin Risden)
 
 Bug Fixes
 ---------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java
index 4491476f030..5da2f862ea2 100644
--- a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java
@@ -79,15 +79,22 @@ public class LeaderElectionIntegrationTest extends SolrCloudTestCase {
               .getCoreDescriptor()
               .getCloudDescriptor()
               .getShardId());
+      String jettyNodeName = jetty.getNodeName(); // must get before shutdown
       jetty.stop();
       stoppedRunners.add(jetty);
+      waitForState(
+          "Leader should not be " + jettyNodeName,
+          collection,
+          (n, c) ->
+              c.getLeader("shard1") != null
+                  && !jettyNodeName.equals(c.getLeader("shard1").getNodeName()));
     }
 
     for (JettySolrRunner runner : stoppedRunners) {
       runner.start();
     }
     waitForState(
-        "Expected to see nodes come back " + collection, collection, (n, c) -> n.size() == 6);
+        "Expected to see nodes come back for " + collection, collection, (n, c) -> n.size() == 6);
     CollectionAdminRequest.deleteCollection(collection).process(cluster.getSolrClient());
 
     // testLeaderElectionAfterClientTimeout
@@ -99,6 +106,7 @@ public class LeaderElectionIntegrationTest extends SolrCloudTestCase {
     // timeout the leader
     String leader = getLeader(collection);
     JettySolrRunner jetty = getRunner(leader);
+    assertNotNull(jetty);
     cluster.expireZkSession(jetty);
 
     for (int i = 0; i < 60; i++) { // wait till leader is changed
diff --git a/solr/solrj-zookeeper/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj-zookeeper/src/java/org/apache/solr/common/cloud/ZkStateReader.java
index 17f7bdbc5d5..4879733e7fe 100644
--- a/solr/solrj-zookeeper/src/java/org/apache/solr/common/cloud/ZkStateReader.java
+++ b/solr/solrj-zookeeper/src/java/org/apache/solr/common/cloud/ZkStateReader.java
@@ -936,7 +936,6 @@ public class ZkStateReader implements SolrCloseable {
   /** Get shard leader properties, with retry if none exist. */
   public Replica getLeaderRetry(String collection, String shard, int timeout)
       throws InterruptedException {
-    AtomicReference<DocCollection> coll = new AtomicReference<>();
     AtomicReference<Replica> leader = new AtomicReference<>();
     try {
       waitForState(
@@ -945,7 +944,6 @@ public class ZkStateReader implements SolrCloseable {
           TimeUnit.MILLISECONDS,
           (n, c) -> {
             if (c == null) return false;
-            coll.set(c);
             Replica l = getLeader(n, c, shard);
             if (l != null) {
               log.debug("leader found for {}/{} to be {}", collection, shard, l);
@@ -1802,6 +1800,18 @@ public class ZkStateReader implements SolrCloseable {
       throw new AlreadyClosedException();
     }
 
+    // Check predicate against known clusterState before trying to add watchers
+    if (clusterState != null) {
+      Set<String> liveNodes = clusterState.getLiveNodes();
+      DocCollection docCollection = clusterState.getCollectionOrNull(collection);
+      if (liveNodes != null && docCollection != null) {
+        if (predicate.matches(liveNodes, docCollection)) {
+          log.debug("Found {} directly in clusterState", predicate);
+          return;
+        }
+      }
+    }
+
     final CountDownLatch latch = new CountDownLatch(1);
     waitLatches.add(latch);
     AtomicReference<DocCollection> docCollection = new AtomicReference<>();
@@ -1855,12 +1865,23 @@ public class ZkStateReader implements SolrCloseable {
       throw new AlreadyClosedException();
     }
 
+    // Check predicate against known clusterState before trying to add watchers
+    if (clusterState != null) {
+      DocCollection docCollection = clusterState.getCollectionOrNull(collection);
+      if (docCollection != null) {
+        if (predicate.test(docCollection)) {
+          log.debug("Found {} directly in clusterState", predicate);
+          return docCollection;
+        }
+      }
+    }
+
     final CountDownLatch latch = new CountDownLatch(1);
     waitLatches.add(latch);
-    AtomicReference<DocCollection> docCollection = new AtomicReference<>();
+    AtomicReference<DocCollection> docCollectionReference = new AtomicReference<>();
     DocCollectionWatcher watcher =
         (c) -> {
-          docCollection.set(c);
+          docCollectionReference.set(c);
           boolean matches = predicate.test(c);
           if (matches) latch.countDown();
 
@@ -1875,8 +1896,8 @@ public class ZkStateReader implements SolrCloseable {
             "Timeout waiting to see state for collection="
                 + collection
                 + " :"
-                + docCollection.get());
-      return docCollection.get();
+                + docCollectionReference.get());
+      return docCollectionReference.get();
     } finally {
       removeDocCollectionWatcher(collection, watcher);
       waitLatches.remove(latch);