You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by tf...@apache.org on 2023/05/04 17:06:42 UTC
[solr] branch branch_9x updated: SOLR-16783: Ignore NoNodeException when deleting clusterstate.json on startup (#1624)
This is an automated email from the ASF dual-hosted git repository.
tflobbe pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new 9ee857ec29d SOLR-16783: Ignore NoNodeException when deleting clusterstate.json on startup (#1624)
9ee857ec29d is described below
commit 9ee857ec29d4a5e3384eccf1a8d674255682d9ed
Author: Tomas Eduardo Fernandez Lobbe <tf...@apache.org>
AuthorDate: Thu May 4 09:52:28 2023 -0700
SOLR-16783: Ignore NoNodeException when deleting clusterstate.json on startup (#1624)
---
solr/CHANGES.txt | 2 +
.../java/org/apache/solr/cloud/ZkController.java | 10 +++-
.../org/apache/solr/cloud/ZkControllerTest.java | 65 ++++++++++++++++++++++
3 files changed, 76 insertions(+), 1 deletion(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 3187f54233b..0d4841cd346 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -117,6 +117,8 @@ Bug Fixes
* SOLR-16771: Fixed behavior and handling of 'unset' logging levels in /admin/info/logging API and related Admin UI (hossman)
+* SOLR-16783: Fixed race condition deleting empty `clusterstate.json` file that could prevent Solr 9 instances from starting with a NoNodeException (Tomás Fernández Löbbe)
+
Dependency Upgrades
---------------------
* PR#1494: Upgrade forbiddenapis to 3.5 (Uwe Schindler)
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 222f8988a38..241eee4f398 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -551,7 +551,6 @@ public class ZkController implements Closeable {
if (!zkClient.exists(ZkStateReader.UNSUPPORTED_CLUSTER_STATE, true)) {
return;
}
-
final byte[] data =
zkClient.getData(ZkStateReader.UNSUPPORTED_CLUSTER_STATE, null, null, true);
@@ -573,6 +572,15 @@ public class ZkController implements Closeable {
log.error(message);
throw new SolrException(SolrException.ErrorCode.INVALID_STATE, message);
}
+ } catch (KeeperException.NoNodeException e) {
+ // N instances starting at the same time could attempt to delete the file, resulting in N-1
+ // NoNodeExceptions.
+ // If we get to this point, then it's OK to suppress the exception and continue assuming
+ // success.
+ log.debug(
+ "NoNodeException attempting to delete {}. Another instance must have deleted it already",
+ ZkStateReader.UNSUPPORTED_CLUSTER_STATE,
+ e);
} catch (KeeperException e) {
// Convert checked exception to one acceptable by the caller (see also init() further down)
log.error("", e);
diff --git a/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java b/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
index 85514bc44c0..0f0b4ef28cf 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
@@ -26,7 +26,10 @@ import java.nio.file.Path;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.MapWriter;
@@ -36,6 +39,8 @@ import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CollectionParams;
+import org.apache.solr.common.util.ExecutorUtil;
+import org.apache.solr.common.util.SolrNamedThreadFactory;
import org.apache.solr.common.util.Utils;
import org.apache.solr.core.CloudConfig;
import org.apache.solr.core.CoreContainer;
@@ -408,6 +413,66 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
}
}
+ public void testCheckNoOldClusterstate() throws Exception {
+ Path zkDir = createTempDir("testCheckNoOldClusterstate");
+ ZkTestServer server = new ZkTestServer(zkDir);
+ CoreContainer cc = getCoreContainer();
+ int nThreads = 5;
+ final ZkController[] controllers = new ZkController[nThreads];
+ ExecutorService svc =
+ ExecutorUtil.newMDCAwareFixedThreadPool(
+ nThreads, new SolrNamedThreadFactory("testCheckNoOldClusterstate-"));
+ try {
+ server.run();
+ server
+ .getZkClient()
+ .create(
+ "/clusterstate.json",
+ "{}".getBytes(StandardCharsets.UTF_8),
+ CreateMode.PERSISTENT,
+ true);
+ AtomicInteger idx = new AtomicInteger();
+ CountDownLatch latch = new CountDownLatch(nThreads);
+ CountDownLatch done = new CountDownLatch(nThreads);
+ AtomicReference<Exception> exception = new AtomicReference<>();
+ for (int i = 0; i < nThreads; i++) {
+ svc.submit(
+ () -> {
+ int index = idx.getAndIncrement();
+ latch.countDown();
+ try {
+ latch.await();
+ controllers[index] =
+ new ZkController(
+ cc,
+ server.getZkAddress(),
+ TIMEOUT,
+ new CloudConfig.CloudConfigBuilder("127.0.0.1", 8983 + index, "solr")
+ .build(),
+ () -> null);
+ } catch (Exception e) {
+ exception.compareAndSet(null, e);
+ } finally {
+ done.countDown();
+ }
+ });
+ }
+ done.await();
+ assertFalse(server.getZkClient().exists("/clusterstate.json", true));
+ assertNull(exception.get());
+ } finally {
+ ExecutorUtil.shutdownNowAndAwaitTermination(svc);
+ for (ZkController controller : controllers) {
+ if (controller != null) {
+ controller.close();
+ }
+ }
+ server.getZkClient().close();
+ cc.shutdown();
+ server.shutdown();
+ }
+ }
+
private CoreContainer getCoreContainer() {
return new MockCoreContainer();
}