You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by tf...@apache.org on 2023/05/04 17:06:42 UTC

[solr] branch branch_9x updated: SOLR-16783: Ignore NoNodeException when deleting clusterstate.json on startup (#1624)

This is an automated email from the ASF dual-hosted git repository.

tflobbe pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new 9ee857ec29d SOLR-16783: Ignore NoNodeException when deleting clusterstate.json on startup (#1624)
9ee857ec29d is described below

commit 9ee857ec29d4a5e3384eccf1a8d674255682d9ed
Author: Tomas Eduardo Fernandez Lobbe <tf...@apache.org>
AuthorDate: Thu May 4 09:52:28 2023 -0700

    SOLR-16783: Ignore NoNodeException when deleting clusterstate.json on startup (#1624)
---
 solr/CHANGES.txt                                   |  2 +
 .../java/org/apache/solr/cloud/ZkController.java   | 10 +++-
 .../org/apache/solr/cloud/ZkControllerTest.java    | 65 ++++++++++++++++++++++
 3 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 3187f54233b..0d4841cd346 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -117,6 +117,8 @@ Bug Fixes
 
 * SOLR-16771: Fixed behavior and handling of 'unset' logging levels in /admin/info/logging API and related Admin UI (hossman)
 
+* SOLR-16783: Fixed race condition deleting empty `clusterstate.json` file that could prevent Solr 9 instances from starting with a NoNodeException (Tomás Fernández Löbbe)
+
 Dependency Upgrades
 ---------------------
 * PR#1494: Upgrade forbiddenapis to 3.5 (Uwe Schindler)
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 222f8988a38..241eee4f398 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -551,7 +551,6 @@ public class ZkController implements Closeable {
       if (!zkClient.exists(ZkStateReader.UNSUPPORTED_CLUSTER_STATE, true)) {
         return;
       }
-
       final byte[] data =
           zkClient.getData(ZkStateReader.UNSUPPORTED_CLUSTER_STATE, null, null, true);
 
@@ -573,6 +572,15 @@ public class ZkController implements Closeable {
         log.error(message);
         throw new SolrException(SolrException.ErrorCode.INVALID_STATE, message);
       }
+    } catch (KeeperException.NoNodeException e) {
+      // N instances starting at the same time could attempt to delete the file, resulting in N-1
+      // NoNodeExceptions.
+      // If we get to this point, then it's OK to suppress the exception and continue assuming
+      // success.
+      log.debug(
+          "NoNodeException attempting to delete {}. Another instance must have deleted it already",
+          ZkStateReader.UNSUPPORTED_CLUSTER_STATE,
+          e);
     } catch (KeeperException e) {
       // Convert checked exception to one acceptable by the caller (see also init() further down)
       log.error("", e);
diff --git a/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java b/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
index 85514bc44c0..0f0b4ef28cf 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
@@ -26,7 +26,10 @@ import java.nio.file.Path;
 import java.util.Collections;
 import java.util.List;
 import java.util.Properties;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicReference;
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.common.MapWriter;
@@ -36,6 +39,8 @@ import org.apache.solr.common.cloud.SolrZkClient;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.CollectionParams;
+import org.apache.solr.common.util.ExecutorUtil;
+import org.apache.solr.common.util.SolrNamedThreadFactory;
 import org.apache.solr.common.util.Utils;
 import org.apache.solr.core.CloudConfig;
 import org.apache.solr.core.CoreContainer;
@@ -408,6 +413,66 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
     }
   }
 
+  public void testCheckNoOldClusterstate() throws Exception {
+    Path zkDir = createTempDir("testCheckNoOldClusterstate");
+    ZkTestServer server = new ZkTestServer(zkDir);
+    CoreContainer cc = getCoreContainer();
+    int nThreads = 5;
+    final ZkController[] controllers = new ZkController[nThreads];
+    ExecutorService svc =
+        ExecutorUtil.newMDCAwareFixedThreadPool(
+            nThreads, new SolrNamedThreadFactory("testCheckNoOldClusterstate-"));
+    try {
+      server.run();
+      server
+          .getZkClient()
+          .create(
+              "/clusterstate.json",
+              "{}".getBytes(StandardCharsets.UTF_8),
+              CreateMode.PERSISTENT,
+              true);
+      AtomicInteger idx = new AtomicInteger();
+      CountDownLatch latch = new CountDownLatch(nThreads);
+      CountDownLatch done = new CountDownLatch(nThreads);
+      AtomicReference<Exception> exception = new AtomicReference<>();
+      for (int i = 0; i < nThreads; i++) {
+        svc.submit(
+            () -> {
+              int index = idx.getAndIncrement();
+              latch.countDown();
+              try {
+                latch.await();
+                controllers[index] =
+                    new ZkController(
+                        cc,
+                        server.getZkAddress(),
+                        TIMEOUT,
+                        new CloudConfig.CloudConfigBuilder("127.0.0.1", 8983 + index, "solr")
+                            .build(),
+                        () -> null);
+              } catch (Exception e) {
+                exception.compareAndSet(null, e);
+              } finally {
+                done.countDown();
+              }
+            });
+      }
+      done.await();
+      assertFalse(server.getZkClient().exists("/clusterstate.json", true));
+      assertNull(exception.get());
+    } finally {
+      ExecutorUtil.shutdownNowAndAwaitTermination(svc);
+      for (ZkController controller : controllers) {
+        if (controller != null) {
+          controller.close();
+        }
+      }
+      server.getZkClient().close();
+      cc.shutdown();
+      server.shutdown();
+    }
+  }
+
   private CoreContainer getCoreContainer() {
     return new MockCoreContainer();
   }