You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by is...@apache.org on 2019/08/29 11:57:59 UTC

[lucene-solr] branch master updated: SOLR-13718: A more targeted fix for SPLITSHARD, thereby avoiding Backup/Restore test failures

This is an automated email from the ASF dual-hosted git repository.

ishan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new f276651  SOLR-13718: A more targeted fix for SPLITSHARD, thereby avoiding Backup/Restore test failures
f276651 is described below

commit f27665198a87692311e7798e835933fc1e9ff986
Author: Ishan Chattopadhyaya <is...@apache.org>
AuthorDate: Thu Aug 29 17:20:46 2019 +0530

    SOLR-13718: A more targeted fix for SPLITSHARD, thereby avoiding Backup/Restore test failures
---
 .../OverseerCollectionMessageHandler.java          | 11 +------
 .../solr/cloud/api/collections/SplitShardCmd.java  | 37 +++++++++++++++++-----
 2 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java
index 6ef7eb3..64b0ef9 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java
@@ -1033,17 +1033,12 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
 
       // If request is async wait for the core admin to complete before returning
       if (asyncId != null) {
-        waitForAsyncCallsToComplete(results, true, msgOnError);
+        waitForAsyncCallsToComplete(results); // TODO: Shouldn't we abort with msgOnError exception when failure?
         shardAsyncIdByNode.clear();
       }
     }
 
     private void waitForAsyncCallsToComplete(NamedList<Object> results) {
-      waitForAsyncCallsToComplete(results, false, null);
-    }
-
-    private void waitForAsyncCallsToComplete(NamedList<Object> results, boolean abortOnFailure, String msgOnError) {
-      boolean failed = false;
       for (Map.Entry<String,String> nodeToAsync:shardAsyncIdByNode) {
         final String node = nodeToAsync.getKey();
         final String shardAsyncId = nodeToAsync.getValue();
@@ -1055,14 +1050,10 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
         if ("failed".equalsIgnoreCase(((String)reqResult.get("STATUS")))) {
           log.error("Error from shard {}: {}", node,  reqResult);
           addFailure(results, node, reqResult);
-          failed = true; 
         } else {
           addSuccess(results, node, reqResult);
         }
       }
-      if (failed && abortOnFailure && msgOnError != null) {
-        throw new SolrException(ErrorCode.SERVER_ERROR, msgOnError);
-      }
     }
 
     /** @deprecated consider to make it private after {@link CreateCollectionCmd} refactoring*/
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
index f18b0b6..da098af 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
@@ -43,6 +43,7 @@ import org.apache.solr.cloud.Overseer;
 import org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler.ShardRequestTracker;
 import org.apache.solr.cloud.overseer.OverseerAction;
 import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.cloud.ClusterState;
 import org.apache.solr.common.cloud.CompositeIdRouter;
 import org.apache.solr.common.cloud.DocCollection;
@@ -225,7 +226,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
           final ShardRequestTracker shardRequestTracker = ocmh.asyncRequestTracker(asyncId);
           shardRequestTracker.sendShardRequest(parentShardLeader.getNodeName(), params, shardHandler);
           SimpleOrderedMap<Object> getRangesResults = new SimpleOrderedMap<>();
-          shardRequestTracker.processResponses(getRangesResults, shardHandler, true, "SPLITSHARD failed to invoke SPLIT.getRanges core admin command");
+          String msgOnError = "SPLITSHARD failed to invoke SPLIT.getRanges core admin command";
+          shardRequestTracker.processResponses(getRangesResults, shardHandler, true, msgOnError);
+          handleFailureOnAsyncRequest(results, msgOnError);
 
           // Extract the recommended splits from the shard response (if it exists)
           // example response: getRangesResults={success={127.0.0.1:62086_solr={responseHeader={status=0,QTime=1},ranges=10-20,3a-3f}}}
@@ -339,7 +342,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
 
       {
         final ShardRequestTracker syncRequestTracker = ocmh.syncRequestTracker();
-        syncRequestTracker.processResponses(results, shardHandler, true, "SPLITSHARD failed to create subshard leaders");
+        String msgOnError = "SPLITSHARD failed to create subshard leaders";
+        syncRequestTracker.processResponses(results, shardHandler, true, msgOnError);
+        handleFailureOnAsyncRequest(results, msgOnError);
       }
       t.stop();
       t = timings.sub("waitForSubSliceLeadersAlive");
@@ -361,7 +366,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
           shardRequestTracker.sendShardRequest(nodeName, p, shardHandler);
         }
 
-        shardRequestTracker.processResponses(results, shardHandler, true, "SPLITSHARD timed out waiting for subshard leaders to come up");
+        String msgOnError = "SPLITSHARD timed out waiting for subshard leaders to come up";
+        shardRequestTracker.processResponses(results, shardHandler, true, msgOnError);
+        handleFailureOnAsyncRequest(results, msgOnError);
       }
       t.stop();
 
@@ -386,7 +393,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
         final ShardRequestTracker shardRequestTracker = ocmh.asyncRequestTracker(asyncId);
         shardRequestTracker.sendShardRequest(parentShardLeader.getNodeName(), params, shardHandler);
 
-        shardRequestTracker.processResponses(results, shardHandler, true, "SPLITSHARD failed to invoke SPLIT core admin command");
+        String msgOnError = "SPLITSHARD failed to invoke SPLIT core admin command";
+        shardRequestTracker.processResponses(results, shardHandler, true, msgOnError);
+        handleFailureOnAsyncRequest(results, msgOnError);
       }
       t.stop();
 
@@ -409,9 +418,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
           shardRequestTracker.sendShardRequest(nodeName, params, shardHandler);
         }
 
-        shardRequestTracker.processResponses(results, shardHandler, true,
-            "SPLITSHARD failed while asking sub shard leaders" +
-                " to apply buffered updates");
+        String msgOnError = "SPLITSHARD failed while asking sub shard leaders to apply buffered updates";
+        shardRequestTracker.processResponses(results, shardHandler, true, msgOnError);
+        handleFailureOnAsyncRequest(results, msgOnError);
       }
       t.stop();
 
@@ -569,7 +578,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
 
       {
         final ShardRequestTracker syncRequestTracker = ocmh.syncRequestTracker();
-        syncRequestTracker.processResponses(results, shardHandler, true, "SPLITSHARD failed to create subshard replicas");
+        String msgOnError = "SPLITSHARD failed to create subshard replicas";
+        syncRequestTracker.processResponses(results, shardHandler, true, msgOnError);
+        handleFailureOnAsyncRequest(results, msgOnError);
       }
       t.stop();
 
@@ -599,6 +610,16 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
     }
   }
 
+  /**
+   * For async requests, the ShardRequestTracker's processResponses() does not
+   * abort on failure (as it should). Until that is fixed, the failure is raised here.
+   */
+  private void handleFailureOnAsyncRequest(NamedList results, String msgOnError) {
+    Object splitResultFailure = results.get("failure");
+    if (splitResultFailure != null) {
+      throw new SolrException(ErrorCode.SERVER_ERROR, msgOnError);
+    }
+  }
   private void checkDiskSpace(String collection, String shard, Replica parentShardLeader) throws SolrException {
     // check that enough disk space is available on the parent leader node
     // otherwise the actual index splitting will always fail