Posted to commits@lucene.apache.org by sh...@apache.org on 2017/07/05 00:10:24 UTC

[09/10] lucene-solr:feature/autoscaling: SOLR-10879: Make sure we don't lose single replicas when deleting a node.

SOLR-10879: Make sure we don't lose single replicas when deleting a node.
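For readers following the change, here is a minimal SolrJ sketch of the behavior this commit introduces. It is illustrative only: it assumes a SolrCloud cluster reachable at the made-up zkHost "localhost:9983" and a collection with replicationFactor=1 whose only replica of some shard lives on the chosen node; the class name and async request ID are likewise made up. After this commit the DELETENODE request is expected to end in FAILED rather than deleting a shard's last non-PULL replica.

import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.response.RequestStatusState;

public class DeleteNodeSketch {
  public static void main(String[] args) throws Exception {
    try (CloudSolrClient client = new CloudSolrClient.Builder().withZkHost("localhost:9983").build()) {
      client.connect();
      // Pick any live node; the assumption is that it hosts the only replica of at least one shard.
      String node = client.getZkStateReader().getClusterState().getLiveNodes().iterator().next();
      new CollectionAdminRequest.DeleteNode(node).processAsync("delete-node-1", client);
      RequestStatusState status = RequestStatusState.SUBMITTED;
      for (int i = 0; i < 200; i++) {  // poll the async status, as DeleteNodeTest does
        status = CollectionAdminRequest.requestStatus("delete-node-1").process(client).getRequestStatus();
        if (status == RequestStatusState.COMPLETED || status == RequestStatusState.FAILED) break;
        Thread.sleep(50);
      }
      // With SOLR-10879 the request fails when the node holds a shard's only
      // non-PULL replica, instead of silently deleting it.
      System.out.println("DELETENODE status: " + status);
    }
  }
}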


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/cb23fa9b
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/cb23fa9b
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/cb23fa9b

Branch: refs/heads/feature/autoscaling
Commit: cb23fa9b4efa5fc7c17f215f507901d459e9aa6f
Parents: ddfa074
Author: Andrzej Bialecki <ab...@apache.org>
Authored: Tue Jul 4 20:19:08 2017 +0200
Committer: Andrzej Bialecki <ab...@apache.org>
Committed: Tue Jul 4 20:19:08 2017 +0200

----------------------------------------------------------------------
 solr/CHANGES.txt                                |  3 +
 .../org/apache/solr/cloud/DeleteNodeCmd.java    | 43 +++++++++++++-
 .../org/apache/solr/cloud/DeleteNodeTest.java   | 61 +++++++++++++++++---
 3 files changed, 96 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cb23fa9b/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 1517163..8d90b02 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -304,6 +304,9 @@ Bug Fixes
 
 * SOLR-10878: MOVEREPLICA command may lose data when replicationFactor is 1. (ab, shalin)
 
+* SOLR-10879: DELETEREPLICA and DELETENODE commands should prevent data loss when
+  replicationFactor is 1. (ab)
+
 Optimizations
 ----------------------
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cb23fa9b/solr/core/src/java/org/apache/solr/cloud/DeleteNodeCmd.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/DeleteNodeCmd.java b/solr/core/src/java/org/apache/solr/cloud/DeleteNodeCmd.java
index afb95a2..51b0956 100644
--- a/solr/core/src/java/org/apache/solr/cloud/DeleteNodeCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/DeleteNodeCmd.java
@@ -19,6 +19,7 @@ package org.apache.solr.cloud;
 
 
 import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 import java.util.concurrent.CountDownLatch;
@@ -26,7 +27,11 @@ import java.util.concurrent.TimeUnit;
 
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.ClusterState;
+import org.apache.solr.common.cloud.DocCollection;
+import org.apache.solr.common.cloud.Replica;
+import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.ZkNodeProps;
+import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.util.NamedList;
 import org.apache.zookeeper.KeeperException;
 import org.slf4j.Logger;
@@ -54,7 +59,40 @@ public class DeleteNodeCmd implements OverseerCollectionMessageHandler.Cmd {
       throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Source Node: " + node + " is not live");
     }
     List<ZkNodeProps> sourceReplicas = ReplaceNodeCmd.getReplicasOfNode(node, state);
-    cleanupReplicas(results, state, sourceReplicas, ocmh, node, message.getStr(ASYNC));
+    List<String> singleReplicas = verifyReplicaAvailability(sourceReplicas, state);
+    if (!singleReplicas.isEmpty()) {
+      results.add("failure", "Can't delete the only existing non-PULL replica(s) on node " + node + ": " + singleReplicas.toString());
+    } else {
+      cleanupReplicas(results, state, sourceReplicas, ocmh, node, message.getStr(ASYNC));
+    }
+  }
+
+  // collect names of replicas that cannot be deleted
+  static List<String> verifyReplicaAvailability(List<ZkNodeProps> sourceReplicas, ClusterState state) {
+    List<String> res = new ArrayList<>();
+    for (ZkNodeProps sourceReplica : sourceReplicas) {
+      String coll = sourceReplica.getStr(COLLECTION_PROP);
+      String shard = sourceReplica.getStr(SHARD_ID_PROP);
+      String replicaName = sourceReplica.getStr(ZkStateReader.REPLICA_PROP);
+      DocCollection collection = state.getCollection(coll);
+      Slice slice = collection.getSlice(shard);
+      if (slice.getReplicas().size() < 2) {
+        // can't delete the only replica in existence
+        res.add(coll + "/" + shard + "/" + replicaName + ", type=" + sourceReplica.getStr(ZkStateReader.REPLICA_TYPE));
+      } else { // check replica types
+        int otherNonPullReplicas = 0;
+        for (Replica r : slice.getReplicas()) {
+          if (!r.getName().equals(replicaName) && !r.getType().equals(Replica.Type.PULL)) {
+            otherNonPullReplicas++;
+          }
+        }
+        // can't delete - there are no other non-pull replicas
+        if (otherNonPullReplicas == 0) {
+          res.add(coll + "/" + shard + "/" + replicaName + ", type=" + sourceReplica.getStr(ZkStateReader.REPLICA_TYPE));
+        }
+      }
+    }
+    return res;
   }
 
   static void cleanupReplicas(NamedList results,
@@ -67,7 +105,8 @@ public class DeleteNodeCmd implements OverseerCollectionMessageHandler.Cmd {
     for (ZkNodeProps sourceReplica : sourceReplicas) {
       String coll = sourceReplica.getStr(COLLECTION_PROP);
       String shard = sourceReplica.getStr(SHARD_ID_PROP);
-      log.info("Deleting replica for collection={} shard={} on node={}", coll, shard, node);
+      String type = sourceReplica.getStr(ZkStateReader.REPLICA_TYPE);
+      log.info("Deleting replica type={} for collection={} shard={} on node={}", type, coll, shard, node);
       NamedList deleteResult = new NamedList();
       try {
         if (async != null) sourceReplica = sourceReplica.plus(ASYNC, async);

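A note on the rule encoded in verifyReplicaAvailability above, with an illustrative distillation that is not part of the patch: a replica on the node being deleted is only considered removable when some other replica of the same shard is NRT or TLOG. PULL replicas do not count because they cannot become leader, so leaving only a PULL replica would still lose the shard's ability to index. The helper class and method below are hypothetical.

import java.util.Arrays;
import java.util.List;
import org.apache.solr.common.cloud.Replica;

class ReplicaSafetyRule {
  // Hypothetical helper, not in the patch: the check reduced to the types of the
  // other replicas remaining in the shard after the deletion.
  static boolean safeToDelete(List<Replica.Type> otherReplicasInShard) {
    // At least one remaining NRT or TLOG replica is required; PULL cannot lead.
    return otherReplicasInShard.stream().anyMatch(t -> t != Replica.Type.PULL);
  }

  public static void main(String[] args) {
    System.out.println(safeToDelete(Arrays.asList(Replica.Type.NRT)));   // true
    System.out.println(safeToDelete(Arrays.asList(Replica.Type.PULL)));  // false
    System.out.println(safeToDelete(Arrays.asList()));                   // false: last replica
  }
}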
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cb23fa9b/solr/core/src/test/org/apache/solr/cloud/DeleteNodeTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/DeleteNodeTest.java b/solr/core/src/test/org/apache/solr/cloud/DeleteNodeTest.java
index 9f461f0..23a4de7 100644
--- a/solr/core/src/test/org/apache/solr/cloud/DeleteNodeTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/DeleteNodeTest.java
@@ -18,19 +18,29 @@
 package org.apache.solr.cloud;
 
 
+import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.List;
 import java.util.Set;
 
 import org.apache.solr.client.solrj.impl.CloudSolrClient;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.client.solrj.response.RequestStatusState;
+import org.apache.solr.common.cloud.ClusterState;
+import org.apache.solr.common.cloud.DocCollection;
+import org.apache.solr.common.cloud.Replica;
+import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.util.StrUtils;
 import org.junit.BeforeClass;
 import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public class DeleteNodeTest extends SolrCloudTestCase {
-  
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
   @BeforeClass
   public static void setupCluster() throws Exception {
     configureCluster(6)
@@ -47,28 +57,61 @@ public class DeleteNodeTest extends SolrCloudTestCase {
     cluster.waitForAllNodes(5000);
     CloudSolrClient cloudClient = cluster.getSolrClient();
     String coll = "deletenodetest_coll";
-    Set<String> liveNodes = cloudClient.getZkStateReader().getClusterState().getLiveNodes();
+    ClusterState state = cloudClient.getZkStateReader().getClusterState();
+    Set<String> liveNodes = state.getLiveNodes();
     ArrayList<String> l = new ArrayList<>(liveNodes);
     Collections.shuffle(l, random());
     CollectionAdminRequest.Create create = pickRandom(
         CollectionAdminRequest.createCollection(coll, "conf1", 5, 2, 0, 0),
         CollectionAdminRequest.createCollection(coll, "conf1", 5, 1, 1, 0),
-        CollectionAdminRequest.createCollection(coll, "conf1", 5, 0, 1, 1));
+        CollectionAdminRequest.createCollection(coll, "conf1", 5, 0, 1, 1),
+        // check RF=1
+        CollectionAdminRequest.createCollection(coll, "conf1", 5, 1, 0, 0),
+        CollectionAdminRequest.createCollection(coll, "conf1", 5, 0, 1, 0)
+        );
     create.setCreateNodeSet(StrUtils.join(l, ',')).setMaxShardsPerNode(3);
     cloudClient.request(create);
+    state = cloudClient.getZkStateReader().getClusterState();
     String node2bdecommissioned = l.get(0);
+    // check what replicas are on the node, and whether the call should fail
+    boolean shouldFail = false;
+    DocCollection docColl = state.getCollection(coll);
+    log.info("#### DocCollection: " + docColl);
+    List<Replica> replicas = docColl.getReplicas(node2bdecommissioned);
+    if (replicas != null) {
+      for (Replica replica : replicas) {
+        String shard = docColl.getShardId(node2bdecommissioned, replica.getStr(ZkStateReader.CORE_NAME_PROP));
+        Slice slice = docColl.getSlice(shard);
+        boolean hasOtherNonPullReplicas = false;
+        for (Replica r: slice.getReplicas()) {
+          if (!r.getName().equals(replica.getName()) &&
+              !r.getNodeName().equals(node2bdecommissioned) &&
+              r.getType() != Replica.Type.PULL) {
+            hasOtherNonPullReplicas = true;
+            break;
+          }
+        }
+        if (!hasOtherNonPullReplicas) {
+          shouldFail = true;
+          break;
+        }
+      }
+    }
     new CollectionAdminRequest.DeleteNode(node2bdecommissioned).processAsync("003", cloudClient);
     CollectionAdminRequest.RequestStatus requestStatus = CollectionAdminRequest.requestStatus("003");
-    boolean success = false;
+    CollectionAdminRequest.RequestStatusResponse rsp = null;
     for (int i = 0; i < 200; i++) {
-      CollectionAdminRequest.RequestStatusResponse rsp = requestStatus.process(cloudClient);
-      if (rsp.getRequestStatus() == RequestStatusState.COMPLETED) {
-        success = true;
+      rsp = requestStatus.process(cloudClient);
+      if (rsp.getRequestStatus() == RequestStatusState.FAILED || rsp.getRequestStatus() == RequestStatusState.COMPLETED) {
         break;
       }
-      assertFalse(rsp.getRequestStatus() == RequestStatusState.FAILED);
       Thread.sleep(50);
     }
-    assertTrue(success);
+    log.info("####### DocCollection after: " + cloudClient.getZkStateReader().getClusterState().getCollection(coll));
+    if (shouldFail) {
+      assertTrue(String.valueOf(rsp), rsp.getRequestStatus() == RequestStatusState.FAILED);
+    } else {
+      assertFalse(String.valueOf(rsp), rsp.getRequestStatus() == RequestStatusState.FAILED);
+    }
   }
 }