You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/04/19 08:09:56 UTC

lucene-solr:branch_7_3: SOLR-12066: Cleanup deleted core when node start

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_7_3 fc5555590 -> ca78f0e0c


SOLR-12066: Cleanup deleted core when node start


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/ca78f0e0
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/ca78f0e0
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/ca78f0e0

Branch: refs/heads/branch_7_3
Commit: ca78f0e0c8ebc88b3b37b8881772f8fbc7918106
Parents: fc55555
Author: Cao Manh Dat <da...@apache.org>
Authored: Fri Mar 30 20:11:39 2018 +0700
Committer: Cao Manh Dat <da...@apache.org>
Committed: Thu Apr 19 15:08:48 2018 +0700

----------------------------------------------------------------------
 solr/CHANGES.txt                                |  2 ++
 .../org/apache/solr/cloud/ZkController.java     | 22 ++++++++++++++++----
 .../org/apache/solr/core/CoreContainer.java     |  7 ++++++-
 .../solr/cloud/DeleteInactiveReplicaTest.java   | 21 ++++++++++++-------
 4 files changed, 40 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ca78f0e0/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 6a724ab..c6aa064 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -38,6 +38,8 @@ Bug Fixes
 
 * SOLR-12146: LIR should skip deleted replicas (Cao Manh Dat)
 
+* SOLR-12066: Cleanup deleted core when node start (Cao Manh Dat)
+
 ==================  7.3.0 ==================
 
 Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ca78f0e0/solr/core/src/java/org/apache/solr/cloud/ZkController.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 5bd7381..a24021a 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -1660,6 +1660,9 @@ public class ZkController {
       Thread.currentThread().interrupt();
       log.error("", e);
       throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
+    } catch (NotInClusterStateException e) {
+      // rethrow as-is, without logging or wrapping, to make the stack trace less verbose
+      throw e;
     } catch (Exception e) {
       log.error("", e);
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "", e);
@@ -1687,7 +1690,7 @@ public class ZkController {
     return true;
   }
 
-  private void checkStateInZk(CoreDescriptor cd) throws InterruptedException {
+  private void checkStateInZk(CoreDescriptor cd) throws InterruptedException, NotInClusterStateException {
     if (!Overseer.isLegacy(zkStateReader)) {
       CloudDescriptor cloudDesc = cd.getCloudDescriptor();
       String nodeName = cloudDesc.getCoreNodeName();
@@ -1721,7 +1724,8 @@ public class ZkController {
           }
           Replica replica = slice.getReplica(coreNodeName);
           if (replica == null) {
-            errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId());
+            errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId() +
+                ", ignore the exception if the replica was deleted");
             return false;
           }
           return true;
@@ -1729,8 +1733,9 @@ public class ZkController {
       } catch (TimeoutException e) {
         String error = errorMessage.get();
         if (error == null)
-          error = "Replica " + coreNodeName + " is not present in cluster state";
-        throw new SolrException(ErrorCode.SERVER_ERROR, error + ": " + collectionState.get());
+          error = "coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId() +
+              ", ignore the exception if the replica was deleted";
+        throw new NotInClusterStateException(ErrorCode.SERVER_ERROR, error);
       }
     }
   }
@@ -2710,6 +2715,15 @@ public class ZkController {
     }
   }
 
+  /**
+   * Thrown during the pre-register process if the replica is not present in the cluster state
+   */
+  public static class NotInClusterStateException extends SolrException {
+    public NotInClusterStateException(ErrorCode code, String msg) {
+      super(code, msg);
+    }
+  }
+
   public boolean checkIfCoreNodeNameAlreadyExists(CoreDescriptor dcore) {
     DocCollection collection = zkStateReader.getClusterState().getCollectionOrNull(dcore.getCollectionName());
     if (collection != null) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ca78f0e0/solr/core/src/java/org/apache/solr/core/CoreContainer.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index b1d8e58..39e6a99 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -670,7 +670,7 @@ public class CoreContainer {
               } catch (InterruptedException e) {
                 Thread.currentThread().interrupt();
               } catch (ExecutionException e) {
-                log.error("Error waiting for SolrCore to be created", e);
+                log.error("Error waiting for SolrCore to be loaded on startup", e.getCause());
               }
             }
           } finally {
@@ -1052,6 +1052,11 @@ public class CoreContainer {
       return core;
     } catch (Exception e) {
       coreInitFailures.put(dcore.getName(), new CoreLoadFailure(dcore, e));
+      if (e instanceof ZkController.NotInClusterStateException && !newCollection) {
+        // this mostly happens when the core was deleted while this node was down
+        unload(dcore.getName(), true, true, true);
+        throw e;
+      }
       solrCores.removeCoreDescriptor(dcore);
       final SolrException solrException = new SolrException(ErrorCode.SERVER_ERROR, "Unable to create core [" + dcore.getName() + "]", e);
       if(core != null && !core.isClosed())

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ca78f0e0/solr/core/src/test/org/apache/solr/cloud/DeleteInactiveReplicaTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/DeleteInactiveReplicaTest.java b/solr/core/src/test/org/apache/solr/cloud/DeleteInactiveReplicaTest.java
index 0f4ff48..b13f48b 100644
--- a/solr/core/src/test/org/apache/solr/cloud/DeleteInactiveReplicaTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/DeleteInactiveReplicaTest.java
@@ -17,6 +17,8 @@
 package org.apache.solr.cloud;
 
 import java.lang.invoke.MethodHandles;
+import java.nio.file.Files;
+import java.util.concurrent.TimeUnit;
 
 import org.apache.solr.client.solrj.SolrClient;
 import org.apache.solr.client.solrj.embedded.JettySolrRunner;
@@ -26,7 +28,11 @@ import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.ZkStateReader;
-import org.apache.solr.core.CoreContainer;
+import org.apache.solr.common.util.TimeSource;
+import org.apache.solr.core.CoreDescriptor;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.util.FileUtils;
+import org.apache.solr.util.TimeOut;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.slf4j.Logger;
@@ -64,6 +70,10 @@ public class DeleteInactiveReplicaTest extends SolrCloudTestCase {
     Slice shard = getRandomShard(collectionState);
     Replica replica = getRandomReplica(shard);
     JettySolrRunner jetty = cluster.getReplicaJetty(replica);
+    CoreDescriptor replicaCd;
+    try (SolrCore core = jetty.getCoreContainer().getCore(replica.getCoreName())) {
+      replicaCd = core.getCoreDescriptor();
+    }
     cluster.stopJettySolrRunner(jetty);
 
     waitForState("Expected replica " + replica.getName() + " on down node to be removed from cluster state", collectionName, (n, c) -> {
@@ -81,12 +91,9 @@ public class DeleteInactiveReplicaTest extends SolrCloudTestCase {
     cluster.startJettySolrRunner(jetty);
     log.info("restarted jetty");
 
-    CoreContainer cc = jetty.getCoreContainer();
-    CoreContainer.CoreLoadFailure loadFailure = cc.getCoreInitFailures().get(replica.getCoreName());
-    assertNotNull("Deleted core was still loaded!", loadFailure);
-    assertNotNull(loadFailure.exception.getCause());
-    assertTrue("Unexpected load failure message: " + loadFailure.exception.getCause().getMessage(),
-        loadFailure.exception.getCause().getMessage().contains("does not exist in shard"));
+    TimeOut timeOut = new TimeOut(60, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+    timeOut.waitFor("Expected data dir and instance dir of " + replica.getName() + " is deleted", ()
+        -> !Files.exists(replicaCd.getInstanceDir()) && !FileUtils.fileExists(replicaCd.getDataDir()));
 
     // Check that we can't create a core with no coreNodeName
     try (SolrClient queryClient = getHttpSolrClient(jetty.getBaseUrl().toString())) {