You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/04/19 08:09:56 UTC
lucene-solr:branch_7_3: SOLR-12066: Cleanup deleted core when node
start
Repository: lucene-solr
Updated Branches:
refs/heads/branch_7_3 fc5555590 -> ca78f0e0c
SOLR-12066: Cleanup deleted core when node start
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/ca78f0e0
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/ca78f0e0
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/ca78f0e0
Branch: refs/heads/branch_7_3
Commit: ca78f0e0c8ebc88b3b37b8881772f8fbc7918106
Parents: fc55555
Author: Cao Manh Dat <da...@apache.org>
Authored: Fri Mar 30 20:11:39 2018 +0700
Committer: Cao Manh Dat <da...@apache.org>
Committed: Thu Apr 19 15:08:48 2018 +0700
----------------------------------------------------------------------
solr/CHANGES.txt | 2 ++
.../org/apache/solr/cloud/ZkController.java | 22 ++++++++++++++++----
.../org/apache/solr/core/CoreContainer.java | 7 ++++++-
.../solr/cloud/DeleteInactiveReplicaTest.java | 21 ++++++++++++-------
4 files changed, 40 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ca78f0e0/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 6a724ab..c6aa064 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -38,6 +38,8 @@ Bug Fixes
* SOLR-12146: LIR should skip deleted replicas (Cao Manh Dat)
+* SOLR-12066: Cleanup deleted core when node start (Cao Manh Dat)
+
================== 7.3.0 ==================
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ca78f0e0/solr/core/src/java/org/apache/solr/cloud/ZkController.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 5bd7381..a24021a 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -1660,6 +1660,9 @@ public class ZkController {
Thread.currentThread().interrupt();
log.error("", e);
throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
+ } catch (NotInClusterStateException e) {
+ // rethrow as-is so the generic handler below does not log and wrap it (keeps the stack trace less verbose)
+ throw e;
} catch (Exception e) {
log.error("", e);
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "", e);
@@ -1687,7 +1690,7 @@ public class ZkController {
return true;
}
- private void checkStateInZk(CoreDescriptor cd) throws InterruptedException {
+ private void checkStateInZk(CoreDescriptor cd) throws InterruptedException, NotInClusterStateException {
if (!Overseer.isLegacy(zkStateReader)) {
CloudDescriptor cloudDesc = cd.getCloudDescriptor();
String nodeName = cloudDesc.getCoreNodeName();
@@ -1721,7 +1724,8 @@ public class ZkController {
}
Replica replica = slice.getReplica(coreNodeName);
if (replica == null) {
- errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId());
+ errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId() +
+ ", ignore the exception if the replica was deleted");
return false;
}
return true;
@@ -1729,8 +1733,9 @@ public class ZkController {
} catch (TimeoutException e) {
String error = errorMessage.get();
if (error == null)
- error = "Replica " + coreNodeName + " is not present in cluster state";
- throw new SolrException(ErrorCode.SERVER_ERROR, error + ": " + collectionState.get());
+ error = "coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId() +
+ ", ignore the exception if the replica was deleted";
+ throw new NotInClusterStateException(ErrorCode.SERVER_ERROR, error);
}
}
}
@@ -2710,6 +2715,15 @@ public class ZkController {
}
}
+ /**
+ * Thrown during the pre-register process if the replica is not present in the cluster state
+ */
+ public static class NotInClusterStateException extends SolrException {
+ public NotInClusterStateException(ErrorCode code, String msg) {
+ super(code, msg);
+ }
+ }
+
public boolean checkIfCoreNodeNameAlreadyExists(CoreDescriptor dcore) {
DocCollection collection = zkStateReader.getClusterState().getCollectionOrNull(dcore.getCollectionName());
if (collection != null) {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ca78f0e0/solr/core/src/java/org/apache/solr/core/CoreContainer.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index b1d8e58..39e6a99 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -670,7 +670,7 @@ public class CoreContainer {
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
} catch (ExecutionException e) {
- log.error("Error waiting for SolrCore to be created", e);
+ log.error("Error waiting for SolrCore to be loaded on startup", e.getCause());
}
}
} finally {
@@ -1052,6 +1052,11 @@ public class CoreContainer {
return core;
} catch (Exception e) {
coreInitFailures.put(dcore.getName(), new CoreLoadFailure(dcore, e));
+ if (e instanceof ZkController.NotInClusterStateException && !newCollection) {
+ // this mostly happens when the core was deleted while this node was down
+ unload(dcore.getName(), true, true, true);
+ throw e;
+ }
solrCores.removeCoreDescriptor(dcore);
final SolrException solrException = new SolrException(ErrorCode.SERVER_ERROR, "Unable to create core [" + dcore.getName() + "]", e);
if(core != null && !core.isClosed())
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ca78f0e0/solr/core/src/test/org/apache/solr/cloud/DeleteInactiveReplicaTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/DeleteInactiveReplicaTest.java b/solr/core/src/test/org/apache/solr/cloud/DeleteInactiveReplicaTest.java
index 0f4ff48..b13f48b 100644
--- a/solr/core/src/test/org/apache/solr/cloud/DeleteInactiveReplicaTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/DeleteInactiveReplicaTest.java
@@ -17,6 +17,8 @@
package org.apache.solr.cloud;
import java.lang.invoke.MethodHandles;
+import java.nio.file.Files;
+import java.util.concurrent.TimeUnit;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
@@ -26,7 +28,11 @@ import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkStateReader;
-import org.apache.solr.core.CoreContainer;
+import org.apache.solr.common.util.TimeSource;
+import org.apache.solr.core.CoreDescriptor;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.util.FileUtils;
+import org.apache.solr.util.TimeOut;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
@@ -64,6 +70,10 @@ public class DeleteInactiveReplicaTest extends SolrCloudTestCase {
Slice shard = getRandomShard(collectionState);
Replica replica = getRandomReplica(shard);
JettySolrRunner jetty = cluster.getReplicaJetty(replica);
+ CoreDescriptor replicaCd;
+ try (SolrCore core = jetty.getCoreContainer().getCore(replica.getCoreName())) {
+ replicaCd = core.getCoreDescriptor();
+ }
cluster.stopJettySolrRunner(jetty);
waitForState("Expected replica " + replica.getName() + " on down node to be removed from cluster state", collectionName, (n, c) -> {
@@ -81,12 +91,9 @@ public class DeleteInactiveReplicaTest extends SolrCloudTestCase {
cluster.startJettySolrRunner(jetty);
log.info("restarted jetty");
- CoreContainer cc = jetty.getCoreContainer();
- CoreContainer.CoreLoadFailure loadFailure = cc.getCoreInitFailures().get(replica.getCoreName());
- assertNotNull("Deleted core was still loaded!", loadFailure);
- assertNotNull(loadFailure.exception.getCause());
- assertTrue("Unexpected load failure message: " + loadFailure.exception.getCause().getMessage(),
- loadFailure.exception.getCause().getMessage().contains("does not exist in shard"));
+ TimeOut timeOut = new TimeOut(60, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+ timeOut.waitFor("Expected data dir and instance dir of " + replica.getName() + " is deleted", ()
+ -> !Files.exists(replicaCd.getInstanceDir()) && !FileUtils.fileExists(replicaCd.getDataDir()));
// Check that we can't create a core with no coreNodeName
try (SolrClient queryClient = getHttpSolrClient(jetty.getBaseUrl().toString())) {