You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2018/08/31 03:54:18 UTC

lucene-solr:branch_7x: SOLR-11990: When a Collection CREATE command fails because of a policy rule violation, the collection metadata remains in ZooKeeper, even though no replicas are added, so the user has to manually delete the zombie collection.

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_7x bcef6a6e0 -> cc5fdbf38


SOLR-11990: When a Collection CREATE command fails because of a policy rule violation, the collection metadata remains in ZooKeeper, even though no replicas are added, so the user has to manually delete the zombie collection.

The zombie collection is the result of a behavior change made in this issue, where collection metadata creation was moved before the attempt to use the policy to place replicas. This change restores the earlier outcome by explicitly deleting the metadata when an AssignmentException occurs. Since the policy depends on collection metadata such as the WITH_COLLECTION and COLOCATED_WITH properties, moving metadata creation after policy use would have required passing these properties through the entire call stack, which would have been quite invasive. Therefore the current approach was adopted.

(cherry picked from commit 5670d612e3f2512d85c972b5fc717586118a19d5)


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/cc5fdbf3
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/cc5fdbf3
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/cc5fdbf3

Branch: refs/heads/branch_7x
Commit: cc5fdbf387d03e4ab0c8bc73c14d60d6fdaa3dc5
Parents: bcef6a6
Author: Shalin Shekhar Mangar <sh...@apache.org>
Authored: Fri Aug 31 09:23:25 2018 +0530
Committer: Shalin Shekhar Mangar <sh...@apache.org>
Committed: Fri Aug 31 09:24:12 2018 +0530

----------------------------------------------------------------------
 .../solr/cloud/api/collections/Assign.java      | 30 +++++++++++++++++---
 .../api/collections/CreateCollectionCmd.java    | 14 +++++++--
 .../solr/cloud/autoscaling/TestPolicyCloud.java | 20 ++++++++++++-
 3 files changed, 56 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc5fdbf3/solr/core/src/java/org/apache/solr/cloud/api/collections/Assign.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/Assign.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/Assign.java
index 2541faa..d323510 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/Assign.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/Assign.java
@@ -250,7 +250,7 @@ public class Assign {
                                                     List<String> shardNames,
                                                     int numNrtReplicas,
                                                     int numTlogReplicas,
-                                                    int numPullReplicas) throws IOException, InterruptedException {
+                                                    int numPullReplicas) throws IOException, InterruptedException, AssignmentException {
     List<Map> rulesMap = (List) message.get("rule");
     String policyName = message.getStr(POLICY);
     AutoScalingConfig autoScalingConfig = cloudManager.getDistribStateManager().getAutoScalingConfig();
@@ -323,7 +323,7 @@ public class Assign {
   // could be created on live nodes given maxShardsPerNode, Replication factor (if from createShard) etc.
   public static List<ReplicaCount> getNodesForNewReplicas(ClusterState clusterState, String collectionName,
                                                           String shard, int nrtReplicas,
-                                                          Object createNodeSet, SolrCloudManager cloudManager) throws IOException, InterruptedException {
+                                                          Object createNodeSet, SolrCloudManager cloudManager) throws IOException, InterruptedException, AssignmentException {
     log.debug("getNodesForNewReplicas() shard: {} , replicas : {} , createNodeSet {}", shard, nrtReplicas, createNodeSet );
     DocCollection coll = clusterState.getCollection(collectionName);
     Integer maxShardsPerNode = coll.getMaxShardsPerNode();
@@ -384,7 +384,7 @@ public class Assign {
                                                               int tlogReplicas,
                                                               int pullReplicas,
                                                               String policyName, SolrCloudManager cloudManager,
-                                                              List<String> nodesList) throws IOException, InterruptedException {
+                                                              List<String> nodesList) throws IOException, InterruptedException, AssignmentException {
     log.debug("shardnames {} NRT {} TLOG {} PULL {} , policy {}, nodeList {}", shardNames, nrtReplicas, tlogReplicas, pullReplicas, policyName, nodesList);
     List<ReplicaPosition> replicaPositions = null;
     AutoScalingConfig autoScalingConfig = cloudManager.getDistribStateManager().getAutoScalingConfig();
@@ -402,7 +402,7 @@ public class Assign {
           nodesList);
       return replicaPositions;
     } catch (Exception e) {
-      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error getting replica locations : " + e.getMessage(), e);
+      throw new AssignmentException("Error getting replica locations : " + e.getMessage(), e);
     } finally {
       if (log.isTraceEnabled()) {
         if (replicaPositions != null)
@@ -484,5 +484,27 @@ public class Assign {
     return nodeNameVsShardCount;
   }
 
+  /**
+   * Thrown if there is an exception while assigning nodes for replicas
+   */
+  public static class AssignmentException extends RuntimeException {
+    public AssignmentException() {
+    }
 
+    public AssignmentException(String message) {
+      super(message);
+    }
+
+    public AssignmentException(String message, Throwable cause) {
+      super(message, cause);
+    }
+
+    public AssignmentException(Throwable cause) {
+      super(cause);
+    }
+
+    public AssignmentException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) {
+      super(message, cause, enableSuppression, writableStackTrace);
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc5fdbf3/solr/core/src/java/org/apache/solr/cloud/api/collections/CreateCollectionCmd.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/CreateCollectionCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/CreateCollectionCmd.java
index e8f100f..4f66ff9 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/CreateCollectionCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/CreateCollectionCmd.java
@@ -177,8 +177,16 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
         throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Could not fully create collection: " + collectionName);
       }
 
-      List<ReplicaPosition> replicaPositions = buildReplicaPositions(ocmh.cloudManager, clusterState, message,
-          nodeList, shardNames, sessionWrapper);
+      List<ReplicaPosition> replicaPositions = null;
+      try {
+        replicaPositions = buildReplicaPositions(ocmh.cloudManager, clusterState, message,
+            nodeList, shardNames, sessionWrapper);
+      } catch (Assign.AssignmentException e) {
+        ZkNodeProps deleteMessage = new ZkNodeProps("name", collectionName);
+        new DeleteCollectionCmd(ocmh).call(clusterState, deleteMessage, results);
+        // unwrap the exception
+        throw new SolrException(ErrorCode.SERVER_ERROR, e.getMessage(), e.getCause());
+      }
 
       if (nodeList.isEmpty()) {
         log.debug("Finished create command for collection: {}", collectionName);
@@ -327,7 +335,7 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
   public static List<ReplicaPosition> buildReplicaPositions(SolrCloudManager cloudManager, ClusterState clusterState,
                                                             ZkNodeProps message,
                                                             List<String> nodeList, List<String> shardNames,
-                                                            AtomicReference<PolicyHelper.SessionWrapper> sessionWrapper) throws IOException, InterruptedException {
+                                                            AtomicReference<PolicyHelper.SessionWrapper> sessionWrapper) throws IOException, InterruptedException, Assign.AssignmentException {
     final String collectionName = message.getStr(NAME);
     // look at the replication factor and see if it matches reality
     // if it does not, find best nodes to create more cores

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc5fdbf3/solr/core/src/test/org/apache/solr/cloud/autoscaling/TestPolicyCloud.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/TestPolicyCloud.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/TestPolicyCloud.java
index c626223..d1dcecf 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/TestPolicyCloud.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/TestPolicyCloud.java
@@ -25,6 +25,7 @@ import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.function.BiConsumer;
 
@@ -54,7 +55,9 @@ import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.TimeSource;
 import org.apache.solr.common.util.Utils;
+import org.apache.solr.util.TimeOut;
 import org.apache.zookeeper.KeeperException;
 import org.junit.After;
 import org.junit.BeforeClass;
@@ -95,7 +98,22 @@ public class TestPolicyCloud extends SolrCloudTestCase {
 
     assertTrue(exp.getMessage().contains("No node can satisfy the rules"));
     assertTrue(exp.getMessage().contains("AutoScaling.error.diagnostics"));
-    CollectionAdminRequest.deleteCollection(collectionName).processAndWait(cluster.getSolrClient(), 60);
+
+    // wait for a while until we don't see the collection
+    TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS, new TimeSource.NanoTimeSource());
+    boolean removed = false;
+    while (! timeout.hasTimedOut()) {
+      timeout.sleep(100);
+      removed = !cluster.getSolrClient().getZkStateReader().getClusterState().hasCollection(collectionName);
+      if (removed) {
+        timeout.sleep(500); // just a bit of time so it's more likely other
+        // readers see on return
+        break;
+      }
+    }
+    if (!removed) {
+      fail("Collection should have been deleted from cluster state but still exists: " + collectionName);
+    }
 
     commands =  "{ set-cluster-policy: [ {cores: '<2', node: '#ANY'} ] }";
     cluster.getSolrClient().request(createAutoScalingRequest(SolrRequest.METHOD.POST, commands));