You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by is...@apache.org on 2019/12/12 03:28:50 UTC
[lucene-solr] branch master updated: SOLR-13945: SPLITSHARD can
cause data loss due to rollback when final commit fails
This is an automated email from the ASF dual-hosted git repository.
ishan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 8db8ab3 SOLR-13945: SPLITSHARD can cause data loss due to rollback when final commit fails
8db8ab3 is described below
commit 8db8ab3be23b029b5b32e06f2682a905d4e707b2
Author: Ishan Chattopadhyaya <is...@apache.org>
AuthorDate: Thu Dec 12 08:58:34 2019 +0530
SOLR-13945: SPLITSHARD can cause data loss due to rollback when final commit fails
---
solr/CHANGES.txt | 3 ++
.../solr/cloud/api/collections/SplitShardCmd.java | 32 ++++++++++++++++++++--
2 files changed, 32 insertions(+), 3 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 1fdc2c8..499e8f9 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -250,6 +250,9 @@ Bug Fixes
* SOLR-13953: Prometheus exporter in SolrCloud mode limited to 100 nodes (Alex Jablonski via Erick Erickson)
+* SOLR-13945: Fix: SPLITSHARD can cause data loss on a failure to commit after the sub-shards are active and a rollback
+ is done to make parent shard active again (Ishan Chattopadhyaya, ab)
+
Other Changes
---------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
index e08c6e5..333051a 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
@@ -542,6 +542,12 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
// always gets a chance to execute. See SOLR-7673
if (repFactor == 1) {
+ // A commit is needed so that documents are visible when the sub-shard replicas come up
+ // (Note: This commit used to be after the state switch, but was brought here before the state switch
+ // as per SOLR-13945 so that sub shards don't come up empty, momentarily, after being marked active)
+ t = timings.sub("finalCommit");
+ ocmh.commit(results, slice.get(), parentShardLeader);
+ t.stop();
// switch sub shard states to 'active'
log.info("Replication factor is 1 so switching shard states");
Map<String, Object> propMap = new HashMap<>();
@@ -583,9 +589,14 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
log.info("Successfully created all replica shards for all sub-slices " + subSlices);
- t = timings.sub("finalCommit");
- ocmh.commit(results, slice.get(), parentShardLeader);
- t.stop();
+ // The final commit was added in SOLR-4997 so that documents are visible
+ // when the sub-shard replicas come up
+ if (repFactor > 1) {
+ t = timings.sub("finalCommit");
+ ocmh.commit(results, slice.get(), parentShardLeader);
+ t.stop();
+ }
+
if (withTiming) {
results.add(CommonParams.TIMING, timings.asNamedList());
}
@@ -675,6 +686,21 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
return;
}
+ // If parent is inactive and all sub shards are active, then rolling back
+ // to make the parent active again will cause data loss.
+ if (coll.getSlice(parentShard).getState() == Slice.State.INACTIVE) {
+ boolean allSubSlicesActive = true;
+ for (String sub: subSlices) {
+ if (coll.getSlice(sub).getState() != Slice.State.ACTIVE) {
+ allSubSlicesActive = false;
+ break;
+ }
+ }
+ if (allSubSlicesActive) {
+ return;
+ }
+ }
+
// set already created sub shards states to CONSTRUCTION - this prevents them
// from entering into RECOVERY or ACTIVE (SOLR-9455)
final Map<String, Object> propMap = new HashMap<>();