You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/03/30 13:11:49 UTC

lucene-solr:master: SOLR-12066: Cleanup deleted core when node start

Repository: lucene-solr
Updated Branches:
  refs/heads/master d483108a1 -> 35bfe8979


SOLR-12066: Cleanup deleted core when node start


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/35bfe897
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/35bfe897
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/35bfe897

Branch: refs/heads/master
Commit: 35bfe897901f1b51bce654b49aecd9560bfa797f
Parents: d483108
Author: Cao Manh Dat <da...@apache.org>
Authored: Fri Mar 30 20:11:39 2018 +0700
Committer: Cao Manh Dat <da...@apache.org>
Committed: Fri Mar 30 20:11:39 2018 +0700

----------------------------------------------------------------------
 solr/CHANGES.txt                                |  2 ++
 .../org/apache/solr/cloud/ZkController.java     | 22 ++++++++++---
 .../org/apache/solr/core/CoreContainer.java     |  7 ++++-
 .../apache/solr/cloud/DeleteReplicaTest.java    | 33 ++++++++++++++++++++
 4 files changed, 59 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/35bfe897/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 5854e0f..12bc25a 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -110,6 +110,8 @@ Optimizations
 
 * SOLR-12146: LIR should skip deleted replicas (Cao Manh Dat)
 
+* SOLR-12066: Cleanup deleted core when node start (Cao Manh Dat)
+
 Other Changes
 ----------------------
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/35bfe897/solr/core/src/java/org/apache/solr/cloud/ZkController.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index c0ddd26..872a8b9 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -1661,6 +1661,9 @@ public class ZkController {
       Thread.currentThread().interrupt();
       log.error("", e);
       throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
+    } catch (NotInClusterStateException e) {
+      // make the stack trace less verbose
+      throw e;
     } catch (Exception e) {
       log.error("", e);
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "", e);
@@ -1688,7 +1691,7 @@ public class ZkController {
     return true;
   }
 
-  private void checkStateInZk(CoreDescriptor cd) throws InterruptedException {
+  private void checkStateInZk(CoreDescriptor cd) throws InterruptedException, NotInClusterStateException {
     if (!Overseer.isLegacy(zkStateReader)) {
       CloudDescriptor cloudDesc = cd.getCloudDescriptor();
       String nodeName = cloudDesc.getCoreNodeName();
@@ -1722,7 +1725,8 @@ public class ZkController {
           }
           Replica replica = slice.getReplica(coreNodeName);
           if (replica == null) {
-            errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId());
+            errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId() +
+                ", ignore the exception if the replica was deleted");
             return false;
           }
           return true;
@@ -1730,8 +1734,9 @@ public class ZkController {
       } catch (TimeoutException e) {
         String error = errorMessage.get();
         if (error == null)
-          error = "Replica " + coreNodeName + " is not present in cluster state";
-        throw new SolrException(ErrorCode.SERVER_ERROR, error + ": " + collectionState.get());
+          error = "coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId() +
+              ", ignore the exception if the replica was deleted";
+        throw new NotInClusterStateException(ErrorCode.SERVER_ERROR, error);
       }
     }
   }
@@ -2711,6 +2716,15 @@ public class ZkController {
     }
   }
 
+  /**
+   * Thrown during the pre-register process if the replica is not present in the cluster state.
+   */
+  public static class NotInClusterStateException extends SolrException {
+    public NotInClusterStateException(ErrorCode code, String msg) {
+      super(code, msg);
+    }
+  }
+
   public boolean checkIfCoreNodeNameAlreadyExists(CoreDescriptor dcore) {
     DocCollection collection = zkStateReader.getClusterState().getCollectionOrNull(dcore.getCollectionName());
     if (collection != null) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/35bfe897/solr/core/src/java/org/apache/solr/core/CoreContainer.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index b667bc0..74b718c 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -677,7 +677,7 @@ public class CoreContainer {
               } catch (InterruptedException e) {
                 Thread.currentThread().interrupt();
               } catch (ExecutionException e) {
-                log.error("Error waiting for SolrCore to be created", e);
+                log.error("Error waiting for SolrCore to be loaded on startup", e.getCause());
               }
             }
           } finally {
@@ -1063,6 +1063,11 @@ public class CoreContainer {
       return core;
     } catch (Exception e) {
       coreInitFailures.put(dcore.getName(), new CoreLoadFailure(dcore, e));
+      if (e instanceof ZkController.NotInClusterStateException && !newCollection) {
+        // this mostly happens when the core was deleted while this node was down
+        unload(dcore.getName(), true, true, true);
+        throw e;
+      }
       solrCores.removeCoreDescriptor(dcore);
       final SolrException solrException = new SolrException(ErrorCode.SERVER_ERROR, "Unable to create core [" + dcore.getName() + "]", e);
       if(core != null && !core.isClosed())

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/35bfe897/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java b/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java
index 3208ebd..1a021d7 100644
--- a/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java
@@ -41,7 +41,10 @@ import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.util.TimeSource;
 import org.apache.solr.common.util.Utils;
+import org.apache.solr.core.CoreDescriptor;
+import org.apache.solr.core.SolrCore;
 import org.apache.solr.core.ZkContainer;
+import org.apache.solr.util.FileUtils;
 import org.apache.solr.util.TimeOut;
 import org.apache.zookeeper.KeeperException;
 import org.junit.BeforeClass;
@@ -153,6 +156,36 @@ public class DeleteReplicaTest extends SolrCloudTestCase {
   }
 
   @Test
+  public void deleteReplicaOnDownNode() throws Exception {
+    final String collectionName = "deleteReplicaOnDownNode";
+    CollectionAdminRequest.createCollection(collectionName, "conf", 1, 2).process(cluster.getSolrClient());
+    waitForState("Expected one shards with two replicas", collectionName, clusterShape(1, 2));
+
+    Slice shard = getCollectionState(collectionName).getSlice("shard1");
+    Replica replica = shard.getReplicas(rep -> !rep.getName().equals(shard.getLeader().getName())).get(0);
+    JettySolrRunner replicaJetty = getJettyForReplica(replica);
+    CoreDescriptor replicaCd;
+    try (SolrCore core = replicaJetty.getCoreContainer().getCore(replica.getCoreName())) {
+      replicaCd = core.getCoreDescriptor();
+    }
+    assertNotNull("Expected core descriptor of "+ replica.getName() + " is not null",replicaCd);
+    String replicaJettyNodeName = replicaJetty.getNodeName();
+
+    // shut down the node hosting the replica
+    replicaJetty.stop();
+    waitForNodeLeave(replicaJettyNodeName);
+    waitForState("Expected one shards with one replica", collectionName, clusterShape(1, 1));
+    CollectionAdminRequest.deleteReplica(collectionName, shard.getName(), replica.getName()).process(cluster.getSolrClient());
+    waitForState("Expected only one replica left", collectionName, (liveNodes, collectionState) -> collectionState.getReplicas().size() == 1);
+
+    // restart the node and make sure the data gets deleted
+    replicaJetty.start();
+    TimeOut timeOut = new TimeOut(60, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+    timeOut.waitFor("Expected data dir and instance dir of " + replica.getName() + " is deleted", ()
+        -> !Files.exists(replicaCd.getInstanceDir()) && !FileUtils.fileExists(replicaCd.getDataDir()));
+  }
+
+  @Test
   public void deleteReplicaByCountForAllShards() throws Exception {
 
     final String collectionName = "deleteByCountNew";