You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by no...@apache.org on 2016/09/07 09:57:19 UTC

[31/50] [abbrv] lucene-solr:apiv2: SOLR-9439: The delete shard API has been made more resilient against failures resulting from non-existent cores.

SOLR-9439: The delete shard API has been made more resilient against failures resulting from non-existent cores.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/02b97a29
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/02b97a29
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/02b97a29

Branch: refs/heads/apiv2
Commit: 02b97a29b747e439bba8ad95a0269f959bea965e
Parents: 2700b95
Author: Shalin Shekhar Mangar <sh...@apache.org>
Authored: Tue Aug 30 23:44:22 2016 +0530
Committer: Shalin Shekhar Mangar <sh...@apache.org>
Committed: Tue Aug 30 23:44:22 2016 +0530

----------------------------------------------------------------------
 solr/CHANGES.txt                                |  3 +-
 .../org/apache/solr/cloud/DeleteShardCmd.java   | 80 +++++++++++++++-----
 .../org/apache/solr/cloud/SplitShardCmd.java    | 24 +-----
 .../org/apache/solr/core/CoreContainer.java     |  4 +-
 4 files changed, 64 insertions(+), 47 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/02b97a29/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 5f8694f..15071db 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -81,7 +81,8 @@ Bug Fixes
 
 * SOLR-9445: Admin requests are retried by CloudSolrClient and LBHttpSolrClient on failure. (shalin)
 
-* SOLR-9439: Shard split clean up logic for older failed splits is faulty. (shalin)
+* SOLR-9439: Shard split clean up logic for older failed splits is faulty. The delete shard API
+  has also been made more resilient against failures resulting from non-existent cores. (shalin)
 
 * SOLR-9430: Fix locale lookup in DIH <propertyWriter/> to use BCP47 language tags
   to be consistent with other places in Solr. Language names still work for backwards

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/02b97a29/solr/core/src/java/org/apache/solr/cloud/DeleteShardCmd.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/DeleteShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/DeleteShardCmd.java
index a7f6d5b..41b74d5 100644
--- a/solr/core/src/java/org/apache/solr/cloud/DeleteShardCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/DeleteShardCmd.java
@@ -16,10 +16,14 @@
  * limitations under the License.
  */
 package org.apache.solr.cloud;
+
 import java.lang.invoke.MethodHandles;
-import java.util.Collections;
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
 import java.util.Map;
+import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.TimeUnit;
 
 import org.apache.solr.cloud.OverseerCollectionMessageHandler.Cmd;
@@ -27,18 +31,23 @@ import org.apache.solr.cloud.overseer.OverseerAction;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.ClusterState;
 import org.apache.solr.common.cloud.DocCollection;
+import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.CoreAdminParams;
-import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.SimpleOrderedMap;
 import org.apache.solr.common.util.Utils;
-import org.apache.solr.handler.component.ShardHandler;
 import org.apache.solr.util.TimeOut;
+import org.apache.zookeeper.KeeperException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
+import static org.apache.solr.common.cloud.ZkStateReader.NODE_NAME_PROP;
+import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
+import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETEREPLICA;
 import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETESHARD;
 import static org.apache.solr.common.params.CommonAdminParams.ASYNC;
 
@@ -87,24 +96,42 @@ public class DeleteShardCmd implements Cmd {
       inQueue.offer(Utils.toJSON(m));
     }
 
-    ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler();
-
     String asyncId = message.getStr(ASYNC);
-    Map<String, String> requestMap = null;
-    if (asyncId != null) {
-      requestMap = new HashMap<>(slice.getReplicas().size(), 1.0f);
-    }
 
     try {
-      ModifiableSolrParams params = new ModifiableSolrParams();
-      params.set(CoreAdminParams.ACTION, CoreAdminParams.CoreAdminAction.UNLOAD.toString());
-      params.set(CoreAdminParams.DELETE_INDEX, message.getBool(CoreAdminParams.DELETE_INDEX, true));
-      params.set(CoreAdminParams.DELETE_INSTANCE_DIR, message.getBool(CoreAdminParams.DELETE_INSTANCE_DIR, true));
-      params.set(CoreAdminParams.DELETE_DATA_DIR, message.getBool(CoreAdminParams.DELETE_DATA_DIR, true));
-
-      ocmh.sliceCmd(clusterState, params, null, slice, shardHandler, asyncId, requestMap);
-
-      ocmh.processResponses(results, shardHandler, true, "Failed to delete shard", asyncId, requestMap, Collections.emptySet());
+      List<ZkNodeProps> replicas = getReplicasForSlice(collectionName, slice);
+      CountDownLatch cleanupLatch = new CountDownLatch(replicas.size());
+      for (ZkNodeProps r : replicas) {
+        final ZkNodeProps replica = r.plus(message.getProperties()).plus("parallel", "true").plus(ASYNC, asyncId);
+        log.info("Deleting replica for collection={} shard={} on node={}", replica.getStr(COLLECTION_PROP), replica.getStr(SHARD_ID_PROP), replica.getStr(CoreAdminParams.NODE));
+        NamedList deleteResult = new NamedList();
+        try {
+          ((DeleteReplicaCmd)ocmh.commandMap.get(DELETEREPLICA)).deleteReplica(clusterState, replica, deleteResult, () -> {
+            cleanupLatch.countDown();
+            if (deleteResult.get("failure") != null) {
+              synchronized (results) {
+                results.add("failure", String.format(Locale.ROOT, "Failed to delete replica for collection=%s shard=%s" +
+                    " on node=%s", replica.getStr(COLLECTION_PROP), replica.getStr(SHARD_ID_PROP), replica.getStr(NODE_NAME_PROP)));
+              }
+            }
+            SimpleOrderedMap success = (SimpleOrderedMap) deleteResult.get("success");
+            if (success != null) {
+              synchronized (results)  {
+                results.add("success", success);
+              }
+            }
+          });
+        } catch (KeeperException e) {
+          log.warn("Error deleting replica: " + r, e);
+          cleanupLatch.countDown();
+        } catch (Exception e) {
+          log.warn("Error deleting replica: " + r, e);
+          cleanupLatch.countDown();
+          throw e;
+        }
+      }
+      log.debug("Waiting for delete shard action to complete");
+      cleanupLatch.await(5, TimeUnit.MINUTES);
 
       ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, DELETESHARD.toLower(), ZkStateReader.COLLECTION_PROP,
           collectionName, ZkStateReader.SHARD_ID_PROP, sliceId);
@@ -114,7 +141,7 @@ public class DeleteShardCmd implements Cmd {
       // wait for a while until we don't see the shard
       TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS);
       boolean removed = false;
-      while (! timeout.hasTimedOut()) {
+      while (!timeout.hasTimedOut()) {
         Thread.sleep(100);
         DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName);
         removed = collection.getSlice(sliceId) == null;
@@ -129,7 +156,6 @@ public class DeleteShardCmd implements Cmd {
       }
 
       log.info("Successfully deleted collection: " + collectionName + ", shard: " + sliceId);
-
     } catch (SolrException e) {
       throw e;
     } catch (Exception e) {
@@ -137,4 +163,18 @@ public class DeleteShardCmd implements Cmd {
           "Error executing delete operation for collection: " + collectionName + " shard: " + sliceId, e);
     }
   }
+
+  private List<ZkNodeProps> getReplicasForSlice(String collectionName, Slice slice) {
+    List<ZkNodeProps> sourceReplicas = new ArrayList<>();
+    for (Replica replica : slice.getReplicas()) {
+      ZkNodeProps props = new ZkNodeProps(
+          COLLECTION_PROP, collectionName,
+          SHARD_ID_PROP, slice.getName(),
+          ZkStateReader.CORE_NAME_PROP, replica.getCoreName(),
+          ZkStateReader.REPLICA_PROP, replica.getName(),
+          CoreAdminParams.NODE, replica.getNodeName());
+      sourceReplicas.add(props);
+    }
+    return sourceReplicas;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/02b97a29/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java
index 4463285..3361a5f 100644
--- a/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java
@@ -219,8 +219,6 @@ public class SplitShardCmd implements Cmd {
             ZkNodeProps m = new ZkNodeProps(propMap);
             try {
               ocmh.commandMap.get(DELETESHARD).call(clusterState, m, new NamedList());
-            } catch (SolrException e) {
-              throwIfNotNonExistentCoreException(subSlice, e);
             } catch (Exception e) {
               throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice,
                   e);
@@ -233,7 +231,7 @@ public class SplitShardCmd implements Cmd {
 
       if (oldShardsDeleted) {
         // refresh the locally cached cluster state
-        zkStateReader.forceUpdateCollection(collectionName);
+        // we know we have the latest because otherwise deleteshard would have failed
         clusterState = zkStateReader.getClusterState();
         collection = clusterState.getCollection(collectionName);
       }
@@ -471,24 +469,4 @@ public class SplitShardCmd implements Cmd {
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, null, e);
     }
   }
-
-  private void throwIfNotNonExistentCoreException(String subSlice, SolrException e) {
-    Throwable t = e;
-    String cause = null;
-    while (t != null) {
-      if (t instanceof SolrException) {
-        SolrException solrException = (SolrException) t;
-        cause = solrException.getMetadata("cause");
-        if (cause != null && !"NonExistentCore".equals(cause)) {
-          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice,
-              e);
-        }
-      }
-      t = t.getCause();
-    }
-    if (!"NonExistentCore".equals(cause)) {
-      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice,
-          e);
-    }
-  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/02b97a29/solr/core/src/java/org/apache/solr/core/CoreContainer.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index 59fe383..0b996b8 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -1019,9 +1019,7 @@ public class CoreContainer {
 
     CoreDescriptor cd = solrCores.getCoreDescriptor(name);
     if (cd == null) {
-      SolrException solrException = new SolrException(ErrorCode.BAD_REQUEST, "Cannot unload non-existent core [" + name + "]");
-      solrException.setMetadata("cause", "NonExistentCore");
-      throw solrException;
+      throw new SolrException(ErrorCode.BAD_REQUEST, "Cannot unload non-existent core [" + name + "]");
     }
 
     boolean close = solrCores.isLoadedNotPendingClose(name);