You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2016/09/12 11:38:52 UTC
lucene-solr:branch_6x: SOLR-9438: Shard split can be marked
successful and sub-shard states switched to 'active' even when one or more
sub-shards replicas do not recover due to the leader crashing or restarting
between the time the replicas are created and before they can recover
Repository: lucene-solr
Updated Branches:
refs/heads/branch_6x 66a51b76b -> 1b03c9403
SOLR-9438: Shard split can be marked successful and sub-shard states switched to 'active' even when one or more sub-shards replicas do not recover due to the leader crashing or restarting between the time the replicas are created and before they can recover
(cherry picked from commit f177a66)
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/1b03c940
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/1b03c940
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/1b03c940
Branch: refs/heads/branch_6x
Commit: 1b03c940398de384c60fb0083a82ddb601db3909
Parents: 66a51b7
Author: Shalin Shekhar Mangar <sh...@apache.org>
Authored: Mon Sep 12 17:00:46 2016 +0530
Committer: Shalin Shekhar Mangar <sh...@apache.org>
Committed: Mon Sep 12 17:08:40 2016 +0530
----------------------------------------------------------------------
solr/CHANGES.txt | 3 +
.../org/apache/solr/cloud/DeleteShardCmd.java | 2 +-
.../org/apache/solr/cloud/RecoveryStrategy.java | 2 +-
.../org/apache/solr/cloud/SplitShardCmd.java | 36 ++++
.../solr/cloud/overseer/CollectionMutator.java | 8 +
.../solr/cloud/overseer/ReplicaMutator.java | 59 ++++-
.../solr/cloud/overseer/SliceMutator.java | 5 +-
.../org/apache/solr/cloud/ShardSplitTest.java | 215 +++++++++++++++++++
.../org/apache/solr/common/cloud/Slice.java | 11 +-
9 files changed, 326 insertions(+), 15 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1b03c940/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 11b845d..8deaf72 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -92,6 +92,9 @@ Bug Fixes
set at a time. This prevents a bug where BasicAuth and SSL could not be
combined on an HttpClient object. (Alan Woodward)
+* SOLR-9438: Shard split can be marked successful and sub-shard states switched to 'active' even when
+ one or more sub-shards replicas do not recover due to the leader crashing or restarting between the time
+ the replicas are created and before they can recover. This can cause data loss. (shalin)
Optimizations
----------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1b03c940/solr/core/src/java/org/apache/solr/cloud/DeleteShardCmd.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/DeleteShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/DeleteShardCmd.java
index 41b74d5..71d9c46 100644
--- a/solr/core/src/java/org/apache/solr/cloud/DeleteShardCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/DeleteShardCmd.java
@@ -79,7 +79,7 @@ public class DeleteShardCmd implements Cmd {
// TODO: Add check for range gaps on Slice deletion
final Slice.State state = slice.getState();
if (!(slice.getRange() == null || state == Slice.State.INACTIVE || state == Slice.State.RECOVERY
- || state == Slice.State.CONSTRUCTION)) {
+ || state == Slice.State.CONSTRUCTION || state == Slice.State.RECOVERY_FAILED)) { // RECOVERY_FAILED is a non-active state, so such slices must be deletable too
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "The slice: " + slice.getName() + " is currently " + state
+ ". Only non-active (or custom-hashed) slices can be deleted.");
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1b03c940/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index abd00ae..8256905 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -582,7 +582,7 @@ public class RecoveryStrategy extends Thread implements Closeable {
prepCmd.setCheckLive(true);
prepCmd.setOnlyIfLeader(true);
final Slice.State state = slice.getState();
- if (state != Slice.State.CONSTRUCTION && state != Slice.State.RECOVERY) {
+ if (state != Slice.State.CONSTRUCTION && state != Slice.State.RECOVERY && state != Slice.State.RECOVERY_FAILED) {
prepCmd.setOnlyIfLeaderActive(true);
}
HttpUriRequestResponse mrr = client.httpUriRequest(prepCmd);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1b03c940/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java
index 3361a5f..5a099e1 100644
--- a/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java
@@ -47,6 +47,7 @@ import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.Utils;
import org.apache.solr.handler.component.ShardHandler;
import org.apache.solr.util.TestInjection;
+import org.apache.zookeeper.data.Stat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -126,6 +127,13 @@ public class SplitShardCmd implements Cmd {
Thread.currentThread().interrupt();
}
+ // let's record the ephemeralOwner of the parent leader node
+ Stat leaderZnodeStat = zkStateReader.getZkClient().exists(ZkStateReader.LIVE_NODES_ZKNODE + "/" + parentShardLeader.getNodeName(), null, true);
+ if (leaderZnodeStat == null) {
+ // we just got to know the leader but its live node is gone already!
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The shard leader node: " + parentShardLeader.getNodeName() + " is not live anymore!");
+ }
+
DocRouter.Range range = parentSlice.getRange();
if (range == null) {
range = new PlainIdRouter().fullRange();
@@ -253,6 +261,8 @@ public class SplitShardCmd implements Cmd {
propMap.put(ZkStateReader.SHARD_RANGE_PROP, subRange.toString());
propMap.put(ZkStateReader.SHARD_STATE_PROP, Slice.State.CONSTRUCTION.toString());
propMap.put(ZkStateReader.SHARD_PARENT_PROP, parentSlice.getName());
+ propMap.put("shard_parent_node", parentShardLeader.getNodeName());
+ propMap.put("shard_parent_zk_session", leaderZnodeStat.getEphemeralOwner());
DistributedQueue inQueue = Overseer.getStateUpdateQueue(zkStateReader.getZkClient());
inQueue.offer(Utils.toJSON(new ZkNodeProps(propMap)));
@@ -420,6 +430,32 @@ public class SplitShardCmd implements Cmd {
assert TestInjection.injectSplitFailureBeforeReplicaCreation();
+ long ephemeralOwner = leaderZnodeStat.getEphemeralOwner();
+ // compare against the ephemeralOwner of the parent leader node
+ leaderZnodeStat = zkStateReader.getZkClient().exists(ZkStateReader.LIVE_NODES_ZKNODE + "/" + parentShardLeader.getNodeName(), null, true);
+ if (leaderZnodeStat == null || ephemeralOwner != leaderZnodeStat.getEphemeralOwner()) {
+ // put sub-shards in recovery_failed state
+ DistributedQueue inQueue = Overseer.getStateUpdateQueue(zkStateReader.getZkClient());
+ Map<String, Object> propMap = new HashMap<>();
+ propMap.put(Overseer.QUEUE_OPERATION, OverseerAction.UPDATESHARDSTATE.toLower());
+ for (String subSlice : subSlices) {
+ propMap.put(subSlice, Slice.State.RECOVERY_FAILED.toString());
+ }
+ propMap.put(ZkStateReader.COLLECTION_PROP, collectionName);
+ ZkNodeProps m = new ZkNodeProps(propMap);
+ inQueue.offer(Utils.toJSON(m));
+
+ if (leaderZnodeStat == null) {
+ // the leader is not live anymore, fail the split!
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The shard leader node: " + parentShardLeader.getNodeName() + " is not live anymore!");
+ } else if (ephemeralOwner != leaderZnodeStat.getEphemeralOwner()) {
+ // there's a new leader, fail the split!
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+ "The zk session id for the shard leader node: " + parentShardLeader.getNodeName() + " has changed from "
+ + ephemeralOwner + " to " + leaderZnodeStat.getEphemeralOwner() + ". This can cause data loss so we must abort the split");
+ }
+ }
+
// we must set the slice state into recovery before actually creating the replica cores
// this ensures that the logic inside Overseer to update sub-shard state to 'active'
// always gets a chance to execute. See SOLR-7673
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1b03c940/solr/core/src/java/org/apache/solr/cloud/overseer/CollectionMutator.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/CollectionMutator.java b/solr/core/src/java/org/apache/solr/cloud/overseer/CollectionMutator.java
index 4d1ddcd..59629de 100644
--- a/solr/core/src/java/org/apache/solr/cloud/overseer/CollectionMutator.java
+++ b/solr/core/src/java/org/apache/solr/cloud/overseer/CollectionMutator.java
@@ -59,11 +59,19 @@ public class CollectionMutator {
String shardRange = message.getStr(ZkStateReader.SHARD_RANGE_PROP);
String shardState = message.getStr(ZkStateReader.SHARD_STATE_PROP);
String shardParent = message.getStr(ZkStateReader.SHARD_PARENT_PROP);
+ String shardParentZkSession = message.getStr("shard_parent_zk_session");
+ String shardParentNode = message.getStr("shard_parent_node");
sliceProps.put(Slice.RANGE, shardRange);
sliceProps.put(ZkStateReader.STATE_PROP, shardState);
if (shardParent != null) {
sliceProps.put(Slice.PARENT, shardParent);
}
+ if (shardParentZkSession != null) {
+ sliceProps.put("shard_parent_zk_session", shardParentZkSession);
+ }
+ if (shardParentNode != null) {
+ sliceProps.put("shard_parent_node", shardParentNode);
+ }
collection = updateSlice(collectionName, collection, new Slice(shardId, replicas, sliceProps));
return new ZkWriteCommand(collectionName, collection);
} else {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1b03c940/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java b/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java
index 495d1d3..97aa555 100644
--- a/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java
+++ b/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java
@@ -38,6 +38,7 @@ import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.Utils;
+import org.apache.zookeeper.data.Stat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -403,19 +404,57 @@ public class ReplicaMutator {
}
if (allActive) {
// hurray, all sub shard replicas are active
- log.info("Shard: {} - All replicas across all fellow sub-shards are now ACTIVE. Preparing to switch shard states.", sliceName);
+ log.info("Shard: {} - All replicas across all fellow sub-shards are now ACTIVE.", sliceName);
String parentSliceName = (String) sliceProps.remove(Slice.PARENT);
+ // now lets see if the parent leader is still the same or else there's a chance of data loss
+ // see SOLR-9438 for details
+ String shardParentZkSession = (String) sliceProps.remove("shard_parent_zk_session");
+ String shardParentNode = (String) sliceProps.remove("shard_parent_node");
+ boolean isLeaderSame = true;
+ if (shardParentNode != null && shardParentZkSession != null) {
+ log.info("Checking whether sub-shard leader node is still the same one at {} with ZK session id {}", shardParentNode, shardParentZkSession);
+ try {
+ Stat leaderZnodeStat = zkStateReader.getZkClient().exists(ZkStateReader.LIVE_NODES_ZKNODE
+ + "/" + shardParentNode, null, true);
+ if (leaderZnodeStat == null) {
+ log.error("The shard leader node: {} is not live anymore!", shardParentNode);
+ isLeaderSame = false;
+ } else if (leaderZnodeStat.getEphemeralOwner() != Long.parseLong(shardParentZkSession)) {
+ log.error("The zk session id for shard leader node: {} has changed from {} to {}",
+ shardParentNode, shardParentZkSession, leaderZnodeStat.getEphemeralOwner());
+ isLeaderSame = false;
+ }
+ } catch (Exception e) {
+ log.warn("Error occurred while checking if parent shard node is still live with the same zk session id. " +
+ "We cannot switch shard states at this time.", e);
+ return collection; // we aren't going to make any changes right now
+ }
+ }
- Map<String, Object> propMap = new HashMap<>();
- propMap.put(Overseer.QUEUE_OPERATION, "updateshardstate");
- propMap.put(parentSliceName, Slice.State.INACTIVE.toString());
- propMap.put(sliceName, Slice.State.ACTIVE.toString());
- for (Slice subShardSlice : subShardSlices) {
- propMap.put(subShardSlice.getName(), Slice.State.ACTIVE.toString());
+ if (isLeaderSame) {
+ log.info("Sub-shard leader node is still the same one at {} with ZK session id {}. Preparing to switch shard states.", shardParentNode, shardParentZkSession);
+ Map<String, Object> propMap = new HashMap<>();
+ propMap.put(Overseer.QUEUE_OPERATION, "updateshardstate");
+ propMap.put(parentSliceName, Slice.State.INACTIVE.toString());
+ propMap.put(sliceName, Slice.State.ACTIVE.toString());
+ for (Slice subShardSlice : subShardSlices) {
+ propMap.put(subShardSlice.getName(), Slice.State.ACTIVE.toString());
+ }
+ propMap.put(ZkStateReader.COLLECTION_PROP, collection.getName());
+ ZkNodeProps m = new ZkNodeProps(propMap);
+ return new SliceMutator(zkStateReader).updateShardState(prevState, m).collection;
+ } else {
+ // we must mark the shard split as failed by switching sub-shards to recovery_failed state
+ Map<String, Object> propMap = new HashMap<>();
+ propMap.put(Overseer.QUEUE_OPERATION, "updateshardstate");
+ propMap.put(sliceName, Slice.State.RECOVERY_FAILED.toString());
+ for (Slice subShardSlice : subShardSlices) {
+ propMap.put(subShardSlice.getName(), Slice.State.RECOVERY_FAILED.toString());
+ }
+ propMap.put(ZkStateReader.COLLECTION_PROP, collection.getName());
+ ZkNodeProps m = new ZkNodeProps(propMap);
+ return new SliceMutator(zkStateReader).updateShardState(prevState, m).collection;
}
- propMap.put(ZkStateReader.COLLECTION_PROP, collection.getName());
- ZkNodeProps m = new ZkNodeProps(propMap);
- return new SliceMutator(zkStateReader).updateShardState(prevState, m).collection;
}
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1b03c940/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java b/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java
index 0a9aaa7..8eb323d 100644
--- a/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java
+++ b/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java
@@ -164,9 +164,10 @@ public class SliceMutator {
log.info("Update shard state " + key + " to " + message.getStr(key));
Map<String, Object> props = slice.shallowCopy();
- if (Slice.State.getState((String) props.get(ZkStateReader.STATE_PROP)) == Slice.State.RECOVERY
- && Slice.State.getState(message.getStr(key)) == Slice.State.ACTIVE) {
+ if (Slice.State.getState(message.getStr(key)) == Slice.State.ACTIVE) {
props.remove(Slice.PARENT);
+ props.remove("shard_parent_node");
+ props.remove("shard_parent_zk_session");
}
props.put(ZkStateReader.STATE_PROP, message.getStr(key));
Slice newSlice = new Slice(slice.getName(), slice.getReplicasCopy(), props);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1b03c940/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java b/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java
index 0000e7b..64757a6 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java
@@ -27,6 +27,7 @@ import java.util.Random;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.lucene.util.LuceneTestCase.Slow;
@@ -38,8 +39,10 @@ import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.CoreAdminRequest;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
+import org.apache.solr.client.solrj.response.CoreAdminResponse;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.RequestStatusState;
import org.apache.solr.common.SolrDocument;
@@ -297,6 +300,218 @@ public class ShardSplitTest extends BasicDistributedZkTest {
}
+  /** Splits SHARD1 while a state watcher kills the SHARD1 leader jetty once SHARD1_0 has more than one replica, then checks that control and cloud doc counts match and, if the sub-shards became active, that their replicas are consistent. */
@Test
+  public void testSplitWithChaosMonkey() throws Exception {
+    waitForThingsToLevelOut(15);
+
+    List<StoppableIndexingThread> indexers = new ArrayList<>();
+    try {
+      for (int i = 0; i < 1; i++) {
+        StoppableIndexingThread thread = new StoppableIndexingThread(controlClient, cloudClient, String.valueOf(i), true);
+        indexers.add(thread);
+        thread.start();
+      }
+      Thread.sleep(1000); // give the indexers some time to do their work
+    } catch (Exception e) {
+      log.error("Error in test", e);
+    } finally {
+      for (StoppableIndexingThread indexer : indexers) {
+        indexer.safeStop();
+        indexer.join();
+      }
+    }
+
+    cloudClient.commit();
+    controlClient.commit();
+
+    AtomicBoolean stop = new AtomicBoolean();
+    AtomicBoolean killed = new AtomicBoolean(false);
+    Runnable monkey = new Runnable() {
+      @Override
+      public void run() {
+        ZkStateReader zkStateReader = cloudClient.getZkStateReader();
+        zkStateReader.registerCollectionStateWatcher(AbstractDistribZkTestBase.DEFAULT_COLLECTION, new CollectionStateWatcher() {
+          @Override
+          public boolean onStateChanged(Set<String> liveNodes, DocCollection collectionState) {
+            if (stop.get()) {
+              return true; // abort and remove the watch
+            }
+            Slice slice = collectionState.getSlice(SHARD1_0);
+            if (slice != null && slice.getReplicas().size() > 1) {
+              // ensure that only one watcher invocation thread can kill!
+              if (killed.compareAndSet(false, true)) {
+                log.info("Monkey thread found 2 replicas for {} {}", AbstractDistribZkTestBase.DEFAULT_COLLECTION, SHARD1_0);
+                CloudJettyRunner cjetty = shardToLeaderJetty.get(SHARD1);
+                try {
+                  Thread.sleep(1000 + random().nextInt(500));
+                  ChaosMonkey.kill(cjetty);
+                  stop.set(true);
+                  return true;
+                } catch (Exception e) {
+                  log.error("Monkey unable to kill jetty at port " + cjetty.jetty.getLocalPort(), e);
+                }
+              }
+            }
+            log.info("Monkey thread found only one replica for {} {}", AbstractDistribZkTestBase.DEFAULT_COLLECTION, SHARD1_0);
+            return false;
+          }
+        });
+      }
+    };
+
+    /*
+     somehow the cluster state object inside this zk state reader has static copy of the collection which is never updated
+     so any call to waitForRecoveriesToFinish just keeps looping until timeout.
+     We workaround by explicitly registering the collection as an interesting one so that it is watched by ZkStateReader
+     see SOLR-9440. TODO: remove this hack after SOLR-9440 is fixed.
+     */
+    cloudClient.getZkStateReader().registerCore(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+
+    Thread monkeyThread = new Thread(monkey);
+    monkeyThread.start();
+    try {
+      CollectionAdminRequest.SplitShard splitShard = CollectionAdminRequest.splitShard(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+      splitShard.setShardName(SHARD1);
+      String asyncId = splitShard.processAsync(cloudClient);
+      RequestStatusState splitStatus = null;
+      try {
+        splitStatus = CollectionAdminRequest.requestStatus(asyncId).waitFor(cloudClient, 120);
+      } catch (Exception e) {
+        log.warn("Failed to get request status, maybe because the overseer node was shutdown by monkey", e);
+      }
+
+      // we don't care if the split failed because we are injecting faults and it is likely
+      // that the split has failed but in any case we want to assert that all docs that got
+      // indexed are available in SolrCloud and if the split succeeded then all replicas of the sub-shard
+      // must be consistent (i.e. have same numdocs)
+
+      log.info("Shard split request finished with state: {}", splitStatus); // may be FAILED, or null if the status call itself failed
+      stop.set(true);
+      monkeyThread.join();
+      Set<String> addFails = new HashSet<>();
+      Set<String> deleteFails = new HashSet<>();
+      for (StoppableIndexingThread indexer : indexers) {
+        addFails.addAll(indexer.getAddFails());
+        deleteFails.addAll(indexer.getDeleteFails());
+      }
+
+      CloudJettyRunner cjetty = shardToLeaderJetty.get(SHARD1);
+      log.info("Starting shard1 leader jetty at port {}", cjetty.jetty.getLocalPort());
+      ChaosMonkey.start(cjetty.jetty);
+      cloudClient.getZkStateReader().forceUpdateCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+      log.info("Current collection state: {}", printClusterStateInfo(AbstractDistribZkTestBase.DEFAULT_COLLECTION));
+
+      boolean replicaCreationsFailed = false;
+      if (splitStatus == RequestStatusState.FAILED) {
+        // either one or more replica creation failed (because it may have been created on the same parent shard leader node)
+        // or the split may have failed while trying to soft-commit *after* all replicas have been created
+        // the latter counts as a successful switch even if the API doesn't say so
+        // so we must find a way to distinguish between the two
+        // an easy way to do that is to look at the sub-shard replicas and check if the replica core actually exists
+        // instead of existing solely inside the cluster state
+        DocCollection collectionState = cloudClient.getZkStateReader().getClusterState().getCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+        Slice slice10 = collectionState.getSlice(SHARD1_0);
+        Slice slice11 = collectionState.getSlice(SHARD1_1);
+        if (slice10 != null && slice11 != null) {
+          for (Replica replica : slice10) {
+            if (!doesReplicaCoreExist(replica)) {
+              replicaCreationsFailed = true;
+              break;
+            }
+          }
+          for (Replica replica : slice11) {
+            if (!doesReplicaCoreExist(replica)) {
+              replicaCreationsFailed = true;
+              break;
+            }
+          }
+        }
+      }
+
+      // true if sub-shard states switch to 'active' eventually
+      AtomicBoolean areSubShardsActive = new AtomicBoolean(false);
+
+      if (!replicaCreationsFailed) {
+        // all sub-shard replicas were created successfully so all cores must recover eventually
+        waitForRecoveriesToFinish(AbstractDistribZkTestBase.DEFAULT_COLLECTION, true);
+        // let's wait for the overseer to switch shard states
+        CountDownLatch latch = new CountDownLatch(1);
+        cloudClient.getZkStateReader().registerCollectionStateWatcher(AbstractDistribZkTestBase.DEFAULT_COLLECTION, new CollectionStateWatcher() {
+          @Override
+          public boolean onStateChanged(Set<String> liveNodes, DocCollection collectionState) {
+            Slice parent = collectionState.getSlice(SHARD1);
+            Slice slice10 = collectionState.getSlice(SHARD1_0);
+            Slice slice11 = collectionState.getSlice(SHARD1_1);
+            if (slice10 != null && slice11 != null &&
+                parent.getState() == Slice.State.INACTIVE &&
+                slice10.getState() == Slice.State.ACTIVE &&
+                slice11.getState() == Slice.State.ACTIVE) {
+              areSubShardsActive.set(true);
+              latch.countDown();
+              return true; // removes the watch
+            } else if (slice10 != null && slice11 != null &&
+                parent.getState() == Slice.State.ACTIVE &&
+                slice10.getState() == Slice.State.RECOVERY_FAILED &&
+                slice11.getState() == Slice.State.RECOVERY_FAILED) {
+              areSubShardsActive.set(false);
+              latch.countDown();
+              return true;
+            }
+            return false;
+          }
+        });
+
+        latch.await(2, TimeUnit.MINUTES);
+
+        if (latch.getCount() != 0) {
+          // sanity check
+          fail("We think that split was successful but sub-shard states were not updated even after 2 minutes.");
+        }
+      }
+
+      cloudClient.commit(); // for visibility of results on sub-shards
+
+      checkShardConsistency(true, true, addFails, deleteFails);
+      long ctrlDocs = controlClient.query(new SolrQuery("*:*")).getResults().getNumFound();
+      // ensure we have added more than 0 docs
+      long cloudClientDocs = cloudClient.query(new SolrQuery("*:*")).getResults().getNumFound();
+      assertTrue("Found " + cloudClientDocs + " cloud docs", cloudClientDocs > 0);
+      assertEquals("Found " + ctrlDocs + " control docs and " + cloudClientDocs + " cloud docs", ctrlDocs, cloudClientDocs);
+
+      // check consistency of sub-shard replica explicitly because checkShardConsistency methods doesn't
+      // handle new shards/replica so well.
+      if (areSubShardsActive.get()) {
+        ClusterState clusterState = cloudClient.getZkStateReader().getClusterState();
+        DocCollection collection = clusterState.getCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+        int numReplicasChecked = assertConsistentReplicas(collection.getSlice(SHARD1_0));
+        assertEquals("We should have checked consistency for exactly 2 replicas of shard1_0", 2, numReplicasChecked);
+        numReplicasChecked = assertConsistentReplicas(collection.getSlice(SHARD1_1));
+        assertEquals("We should have checked consistency for exactly 2 replicas of shard1_1", 2, numReplicasChecked);
+      }
+    } finally {
+      stop.set(true);
+      monkeyThread.join();
+    }
+  }
+
+  /** Returns true only if the node at the replica's base URL reports a non-empty core status for the replica's core. */
+  private boolean doesReplicaCoreExist(Replica replica) throws IOException {
+    try (HttpSolrClient client = new HttpSolrClient.Builder(replica.getStr(BASE_URL_PROP))
+        .withHttpClient(cloudClient.getLbClient().getHttpClient()).build()) {
+      String coreName = replica.getCoreName();
+      try {
+        CoreAdminResponse status = CoreAdminRequest.getStatus(coreName, client);
+        // a missing or empty status means the replica is listed in the cluster state but has no actual core
+        return status.getCoreStatus(coreName) != null && status.getCoreStatus(coreName).size() > 0;
+      } catch (Exception e) {
+        // best effort: any failure of the status request is treated as "core does not exist"
+        log.warn("Error getting core status of replica " + replica + ". Perhaps it does not exist!", e);
+        return false;
+      }
+    }
+  }
+
+ @Test
public void testSplitShardWithRule() throws Exception {
waitForThingsToLevelOut(15);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1b03c940/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java b/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java
index 3ace17a..e4be009 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java
@@ -81,7 +81,16 @@ public class Slice extends ZkNodeProps implements Iterable<Replica> {
* shard in that state still receives update requests from the parent shard
* leader, however does not participate in distributed search.
*/
- RECOVERY;
+ RECOVERY,
+
+ /**
+ * Sub-shards of a split shard are put in that state when the split is deemed failed
+ * by the overseer even though all replicas are active because either the leader node is
+ * no longer live or has a different ephemeral owner (zk session id). Such conditions can potentially
+ * lead to data loss. See SOLR-9438 for details. A shard in that state will neither receive
+ * update requests from the parent shard leader, nor participate in distributed search.
+ */
+ RECOVERY_FAILED;
@Override
public String toString() {