You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by is...@apache.org on 2019/12/12 03:28:50 UTC

[lucene-solr] branch master updated: SOLR-13945: SPLITSHARD can cause data loss due to rollback when final commit fails

This is an automated email from the ASF dual-hosted git repository.

ishan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 8db8ab3  SOLR-13945: SPLITSHARD can cause data loss due to rollback when final commit fails
8db8ab3 is described below

commit 8db8ab3be23b029b5b32e06f2682a905d4e707b2
Author: Ishan Chattopadhyaya <is...@apache.org>
AuthorDate: Thu Dec 12 08:58:34 2019 +0530

    SOLR-13945: SPLITSHARD can cause data loss due to rollback when final commit fails
---
 solr/CHANGES.txt                                   |  3 ++
 .../solr/cloud/api/collections/SplitShardCmd.java  | 32 ++++++++++++++++++++--
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 1fdc2c8..499e8f9 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -250,6 +250,9 @@ Bug Fixes
 
 * SOLR-13953: Prometheus exporter in SolrCloud mode limited to 100 nodes (Alex Jablonski via Erick Erickson)
 
+* SOLR-13945: Fix data loss in SPLITSHARD when the final commit fails after the sub-shards are active and the
+  subsequent rollback makes the parent shard active again (Ishan Chattopadhyaya, ab)
+
 Other Changes
 ---------------------
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
index e08c6e5..333051a 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
@@ -542,6 +542,12 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
       // always gets a chance to execute. See SOLR-7673
 
       if (repFactor == 1) {
+        // A commit is needed so that documents are visible when the sub-shard replicas come up.
+        // (Note: this commit used to run after the state switch; SOLR-13945 moved it before the switch
+        //  so that sub-shards do not momentarily come up empty after being marked active.)
+        t = timings.sub("finalCommit");
+        ocmh.commit(results, slice.get(), parentShardLeader);
+        t.stop();
         // switch sub shard states to 'active'
         log.info("Replication factor is 1 so switching shard states");
         Map<String, Object> propMap = new HashMap<>();
@@ -583,9 +589,14 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
 
       log.info("Successfully created all replica shards for all sub-slices " + subSlices);
 
-      t = timings.sub("finalCommit");
-      ocmh.commit(results, slice.get(), parentShardLeader);
-      t.stop();
+      // The final commit was added in SOLR-4997 so that documents are visible
+      // when the sub-shard replicas come up
+      if (repFactor > 1) {
+        t = timings.sub("finalCommit");
+        ocmh.commit(results, slice.get(), parentShardLeader);
+        t.stop();
+      }
+
       if (withTiming) {
         results.add(CommonParams.TIMING, timings.asNamedList());
       }
@@ -675,6 +686,21 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
       return;
     }
 
+    // If the parent is inactive and all sub-shards are active, rolling back to make the
+    // parent active again would cause data loss, so return here without rolling back.
+    if (coll.getSlice(parentShard).getState() == Slice.State.INACTIVE) {
+      boolean allSubSlicesActive = true;
+      for (String sub: subSlices) {
+        if (coll.getSlice(sub).getState() != Slice.State.ACTIVE) {
+          allSubSlicesActive = false;
+          break;
+        }
+      }
+      if (allSubSlicesActive) {
+        return;
+      }
+    }
+
     // set already created sub shards states to CONSTRUCTION - this prevents them
     // from entering into RECOVERY or ACTIVE (SOLR-9455)
     final Map<String, Object> propMap = new HashMap<>();