You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by is...@apache.org on 2019/08/29 04:44:52 UTC

[lucene-solr] branch branch_8x updated: SOLR-13718: SPLITSHARD (async) with failures in underlying sub-operations can result in data loss

This is an automated email from the ASF dual-hosted git repository.

ishan pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new d606ffd  SOLR-13718: SPLITSHARD (async) with failures in underlying sub-operations can result in data loss
d606ffd is described below

commit d606ffdea92513a29bd7d7a1af3cfdf556aae93c
Author: Ishan Chattopadhyaya <is...@apache.org>
AuthorDate: Thu Aug 29 10:04:08 2019 +0530

    SOLR-13718: SPLITSHARD (async) with failures in underlying sub-operations can result in data loss
    
      When SPLITSHARD is issued asynchronously, an exception in any sub-operation isn't propagated, so the overall
      SPLITSHARD task proceeds as if there were no failures. As a result, the active parent shard is marked inactive
      even though the split failed, which can leave two empty sub-shards and thus cause data loss.
---
 solr/CHANGES.txt                                              |  2 ++
 .../api/collections/OverseerCollectionMessageHandler.java     | 11 ++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 030fc90..66f9132 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -129,6 +129,8 @@ Bug Fixes
 
 * SOLR-13699 - maxChars no longer working on CopyField with javabin (Chris Troullis via noble)
 
+* SOLR-13718: SPLITSHARD (async) with failures in underlying sub-operations can result in data loss (Ishan Chattopadhyaya)
+
 Other Changes
 ----------------------
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java
index 6fbab13..6ef7eb3 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java
@@ -1033,12 +1033,17 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
 
       // If request is async wait for the core admin to complete before returning
       if (asyncId != null) {
-        waitForAsyncCallsToComplete(results);
+        waitForAsyncCallsToComplete(results, true, msgOnError);
         shardAsyncIdByNode.clear();
       }
     }
 
     private void waitForAsyncCallsToComplete(NamedList<Object> results) {
+      waitForAsyncCallsToComplete(results, false, null);
+    }
+
+    private void waitForAsyncCallsToComplete(NamedList<Object> results, boolean abortOnFailure, String msgOnError) {
+      boolean failed = false;
       for (Map.Entry<String,String> nodeToAsync:shardAsyncIdByNode) {
         final String node = nodeToAsync.getKey();
         final String shardAsyncId = nodeToAsync.getValue();
@@ -1050,10 +1055,14 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
         if ("failed".equalsIgnoreCase(((String)reqResult.get("STATUS")))) {
           log.error("Error from shard {}: {}", node,  reqResult);
           addFailure(results, node, reqResult);
+          failed = true; 
         } else {
           addSuccess(results, node, reqResult);
         }
       }
+      if (failed && abortOnFailure && msgOnError != null) {
+        throw new SolrException(ErrorCode.SERVER_ERROR, msgOnError);
+      }
     }
 
     /** @deprecated consider to make it private after {@link CreateCollectionCmd} refactoring*/