Posted to commits@lucene.apache.org by sh...@apache.org on 2016/09/12 11:51:25 UTC

lucene-solr:branch_6_2: SOLR-9438: Shard split can be marked successful and sub-shard states switched to 'active' even when one or more sub-shard replicas do not recover due to the leader crashing or restarting between the time the replicas are created and before they can recover

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_6_2 84dfd1937 -> 8027eb980


SOLR-9438: Shard split can be marked successful and sub-shard states switched to 'active' even when one or more sub-shard replicas do not recover due to the leader crashing or restarting between the time the replicas are created and before they can recover

(cherry picked from commit f177a66)

(cherry picked from commit 1b03c94)
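
The heart of the fix is a ZooKeeper "session pinning" check: before creating sub-shard replicas, the
split records the ephemeralOwner (zk session id) of the parent shard leader's /live_nodes entry, and it
verifies that value again before proceeding and before the final shard-state switch. A minimal sketch of
the idea, assuming hypothetical class and path names (only the ZooKeeper calls mirror the patch):

    import org.apache.zookeeper.KeeperException;
    import org.apache.zookeeper.ZooKeeper;
    import org.apache.zookeeper.data.Stat;

    public class LeaderSessionGuard {
      private final ZooKeeper zk;
      private final String liveNodePath; // e.g. "/live_nodes/127.0.0.1:8983_solr" (illustrative)
      private final long pinnedSessionId;

      public LeaderSessionGuard(ZooKeeper zk, String liveNodePath)
          throws KeeperException, InterruptedException {
        this.zk = zk;
        this.liveNodePath = liveNodePath;
        Stat stat = zk.exists(liveNodePath, false);
        if (stat == null) {
          throw new IllegalStateException("Leader live node is already gone: " + liveNodePath);
        }
        // An ephemeral znode is owned by the session that created it; a restarted node
        // re-registers under a new session id, so a changed owner proves the old leader
        // process is gone even if a node with the same name appears live again.
        this.pinnedSessionId = stat.getEphemeralOwner();
      }

      /** @return true only if the same leader process still owns the live node. */
      public boolean leaderUnchanged() throws KeeperException, InterruptedException {
        Stat stat = zk.exists(liveNodePath, false);
        return stat != null && stat.getEphemeralOwner() == pinnedSessionId;
      }
    }

When the check fails, the sub-shards are moved to the new RECOVERY_FAILED state instead of ACTIVE; the
SplitShardCmd, ReplicaMutator and Slice hunks below carry the details.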


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/8027eb98
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/8027eb98
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/8027eb98

Branch: refs/heads/branch_6_2
Commit: 8027eb9803c2248a427f45c3771d1cb88d57f4b1
Parents: 84dfd19
Author: Shalin Shekhar Mangar <sh...@apache.org>
Authored: Mon Sep 12 17:00:46 2016 +0530
Committer: Shalin Shekhar Mangar <sh...@apache.org>
Committed: Mon Sep 12 17:21:12 2016 +0530

----------------------------------------------------------------------
 solr/CHANGES.txt                                |   4 +
 .../org/apache/solr/cloud/DeleteShardCmd.java   |   2 +-
 .../org/apache/solr/cloud/RecoveryStrategy.java |   2 +-
 .../org/apache/solr/cloud/SplitShardCmd.java    |  36 ++++
 .../solr/cloud/overseer/CollectionMutator.java  |   8 +
 .../solr/cloud/overseer/ReplicaMutator.java     |  59 ++++-
 .../solr/cloud/overseer/SliceMutator.java       |   5 +-
 .../org/apache/solr/cloud/ShardSplitTest.java   | 215 +++++++++++++++++++
 .../org/apache/solr/common/cloud/Slice.java     |  11 +-
 9 files changed, 327 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8027eb98/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 3c3038b..0f76319 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -54,6 +54,10 @@ Bug Fixes
 
 * SOLR-9490: Fixed bugs in BoolField that caused it to erroneously return "false" for all docs depending
   on usage (Colvin Cowie, Dan Fox, hossman)
+
+* SOLR-9438: Shard split can be marked successful and sub-shard states switched to 'active' even when
+  one or more sub-shard replicas do not recover due to the leader crashing or restarting between the time
+  the replicas are created and before they can recover. This can cause data loss. (shalin)
 ==================  6.2.0 ==================
 
 Versions of Major Components

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8027eb98/solr/core/src/java/org/apache/solr/cloud/DeleteShardCmd.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/DeleteShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/DeleteShardCmd.java
index a7f6d5b..64653a7 100644
--- a/solr/core/src/java/org/apache/solr/cloud/DeleteShardCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/DeleteShardCmd.java
@@ -70,7 +70,7 @@ public class DeleteShardCmd implements Cmd {
     // TODO: Add check for range gaps on Slice deletion
     final Slice.State state = slice.getState();
     if (!(slice.getRange() == null || state == Slice.State.INACTIVE || state == Slice.State.RECOVERY
-        || state == Slice.State.CONSTRUCTION)) {
+        || state == Slice.State.CONSTRUCTION || state == Slice.State.RECOVERY_FAILED)) {
       throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "The slice: " + slice.getName() + " is currently " + state
           + ". Only non-active (or custom-hashed) slices can be deleted.");
     }
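
Restated as a standalone predicate for readability (a sketch only; the patch keeps the inline form), the
condition now reads: a slice may be deleted iff it is custom-hashed or in a non-active state, with
RECOVERY_FAILED joining the non-active set:

    // Hypothetical helper, not part of the patch.
    static boolean isDeletable(Slice slice) {
      final Slice.State state = slice.getState();
      return slice.getRange() == null              // custom-hashed
          || state == Slice.State.INACTIVE
          || state == Slice.State.RECOVERY
          || state == Slice.State.CONSTRUCTION
          || state == Slice.State.RECOVERY_FAILED; // new in this patch
    }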

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8027eb98/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index abd00ae..8256905 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -582,7 +582,7 @@ public class RecoveryStrategy extends Thread implements Closeable {
       prepCmd.setCheckLive(true);
       prepCmd.setOnlyIfLeader(true);
       final Slice.State state = slice.getState();
-      if (state != Slice.State.CONSTRUCTION && state != Slice.State.RECOVERY) {
+      if (state != Slice.State.CONSTRUCTION && state != Slice.State.RECOVERY && state != Slice.State.RECOVERY_FAILED) {
         prepCmd.setOnlyIfLeaderActive(true);
       }
       HttpUriRequestResponse mrr = client.httpUriRequest(prepCmd);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8027eb98/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java
index 4463285..23622c1 100644
--- a/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java
@@ -47,6 +47,7 @@ import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.Utils;
 import org.apache.solr.handler.component.ShardHandler;
 import org.apache.solr.util.TestInjection;
+import org.apache.zookeeper.data.Stat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -126,6 +127,13 @@ public class SplitShardCmd implements Cmd {
       Thread.currentThread().interrupt();
     }
 
+    // let's record the ephemeralOwner of the parent leader node
+    Stat leaderZnodeStat = zkStateReader.getZkClient().exists(ZkStateReader.LIVE_NODES_ZKNODE + "/" + parentShardLeader.getNodeName(), null, true);
+    if (leaderZnodeStat == null)  {
+      // we just got to know the leader but its live node is gone already!
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The shard leader node: " + parentShardLeader.getNodeName() + " is not live anymore!");
+    }
+
     DocRouter.Range range = parentSlice.getRange();
     if (range == null) {
       range = new PlainIdRouter().fullRange();
@@ -255,6 +263,8 @@ public class SplitShardCmd implements Cmd {
         propMap.put(ZkStateReader.SHARD_RANGE_PROP, subRange.toString());
         propMap.put(ZkStateReader.SHARD_STATE_PROP, Slice.State.CONSTRUCTION.toString());
         propMap.put(ZkStateReader.SHARD_PARENT_PROP, parentSlice.getName());
+        propMap.put("shard_parent_node", parentShardLeader.getNodeName());
+        propMap.put("shard_parent_zk_session", leaderZnodeStat.getEphemeralOwner());
         DistributedQueue inQueue = Overseer.getStateUpdateQueue(zkStateReader.getZkClient());
         inQueue.offer(Utils.toJSON(new ZkNodeProps(propMap)));
 
@@ -422,6 +432,32 @@ public class SplitShardCmd implements Cmd {
 
       assert TestInjection.injectSplitFailureBeforeReplicaCreation();
 
+      long ephemeralOwner = leaderZnodeStat.getEphemeralOwner();
+      // compare against the ephemeralOwner of the parent leader node
+      leaderZnodeStat = zkStateReader.getZkClient().exists(ZkStateReader.LIVE_NODES_ZKNODE + "/" + parentShardLeader.getNodeName(), null, true);
+      if (leaderZnodeStat == null || ephemeralOwner != leaderZnodeStat.getEphemeralOwner()) {
+        // put sub-shards in recovery_failed state
+        DistributedQueue inQueue = Overseer.getStateUpdateQueue(zkStateReader.getZkClient());
+        Map<String, Object> propMap = new HashMap<>();
+        propMap.put(Overseer.QUEUE_OPERATION, OverseerAction.UPDATESHARDSTATE.toLower());
+        for (String subSlice : subSlices) {
+          propMap.put(subSlice, Slice.State.RECOVERY_FAILED.toString());
+        }
+        propMap.put(ZkStateReader.COLLECTION_PROP, collectionName);
+        ZkNodeProps m = new ZkNodeProps(propMap);
+        inQueue.offer(Utils.toJSON(m));
+
+        if (leaderZnodeStat == null)  {
+          // the leader is not live anymore, fail the split!
+          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The shard leader node: " + parentShardLeader.getNodeName() + " is not live anymore!");
+        } else if (ephemeralOwner != leaderZnodeStat.getEphemeralOwner()) {
+          // there's a new leader, fail the split!
+          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+              "The zk session id for the shard leader node: " + parentShardLeader.getNodeName() + " has changed from "
+                  + ephemeralOwner + " to " + leaderZnodeStat.getEphemeralOwner() + ". This can cause data loss so we must abort the split");
+        }
+      }
+
       // we must set the slice state into recovery before actually creating the replica cores
       // this ensures that the logic inside Overseer to update sub-shard state to 'active'
       // always gets a chance to execute. See SOLR-7673
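
For reference, the state-update message enqueued above is consumed by the Overseer's UPDATESHARDSTATE
handler; serialized to JSON it would look roughly like the following (sub-shard names illustrative):

    {"operation":"updateshardstate",
     "shard1_0":"recovery_failed",
     "shard1_1":"recovery_failed",
     "collection":"collection1"}

Every key other than "operation" and "collection" is interpreted as a slice name mapped to its new state,
which is how the loop over subSlices above works.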

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8027eb98/solr/core/src/java/org/apache/solr/cloud/overseer/CollectionMutator.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/CollectionMutator.java b/solr/core/src/java/org/apache/solr/cloud/overseer/CollectionMutator.java
index 4d1ddcd..59629de 100644
--- a/solr/core/src/java/org/apache/solr/cloud/overseer/CollectionMutator.java
+++ b/solr/core/src/java/org/apache/solr/cloud/overseer/CollectionMutator.java
@@ -59,11 +59,19 @@ public class CollectionMutator {
       String shardRange = message.getStr(ZkStateReader.SHARD_RANGE_PROP);
       String shardState = message.getStr(ZkStateReader.SHARD_STATE_PROP);
       String shardParent = message.getStr(ZkStateReader.SHARD_PARENT_PROP);
+      String shardParentZkSession = message.getStr("shard_parent_zk_session");
+      String shardParentNode = message.getStr("shard_parent_node");
       sliceProps.put(Slice.RANGE, shardRange);
       sliceProps.put(ZkStateReader.STATE_PROP, shardState);
       if (shardParent != null) {
         sliceProps.put(Slice.PARENT, shardParent);
       }
+      if (shardParentZkSession != null) {
+        sliceProps.put("shard_parent_zk_session", shardParentZkSession);
+      }
+      if (shardParentNode != null)  {
+        sliceProps.put("shard_parent_node", shardParentNode);
+      }
       collection = updateSlice(collectionName, collection, new Slice(shardId, replicas, sliceProps));
       return new ZkWriteCommand(collectionName, collection);
     } else {
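
With these two properties persisted, a sub-shard's entry in the collection's state carries enough
information for the Overseer to re-verify the parent leader later. A rough, illustrative fragment (all
values hypothetical):

    "shard1_0":{
      "range":"80000000-bfffffff",
      "state":"construction",
      "parent":"shard1",
      "shard_parent_node":"127.0.0.1:8983_solr",
      "shard_parent_zk_session":"98312031920128001",
      "replicas":{...}}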

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8027eb98/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java b/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java
index 495d1d3..97aa555 100644
--- a/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java
+++ b/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java
@@ -38,6 +38,7 @@ import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.util.Utils;
+import org.apache.zookeeper.data.Stat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -403,19 +404,57 @@ public class ReplicaMutator {
           }
           if (allActive) {
             // hurray, all sub shard replicas are active
-            log.info("Shard: {} - All replicas across all fellow sub-shards are now ACTIVE. Preparing to switch shard states.", sliceName);
+            log.info("Shard: {} - All replicas across all fellow sub-shards are now ACTIVE.", sliceName);
             String parentSliceName = (String) sliceProps.remove(Slice.PARENT);
+            // now let's see if the parent leader is still the same, or else there's a chance of data loss
+            // see SOLR-9438 for details
+            String shardParentZkSession  = (String) sliceProps.remove("shard_parent_zk_session");
+            String shardParentNode = (String) sliceProps.remove("shard_parent_node");
+            boolean isLeaderSame = true;
+            if (shardParentNode != null && shardParentZkSession != null)  {
+              log.info("Checking whether sub-shard leader node is still the same one at {} with ZK session id {}", shardParentNode, shardParentZkSession);
+              try {
+                Stat leaderZnodeStat = zkStateReader.getZkClient().exists(ZkStateReader.LIVE_NODES_ZKNODE
+                    + "/" + shardParentNode, null, true);
+                if (leaderZnodeStat == null)  {
+                  log.error("The shard leader node: {} is not live anymore!", shardParentNode);
+                  isLeaderSame = false;
+                } else if (leaderZnodeStat.getEphemeralOwner() != Long.parseLong(shardParentZkSession))  {
+                  log.error("The zk session id for shard leader node: {} has changed from {} to {}",
+                      shardParentNode, shardParentZkSession, leaderZnodeStat.getEphemeralOwner());
+                  isLeaderSame = false;
+                }
+              } catch (Exception e) {
+                log.warn("Error occurred while checking if parent shard node is still live with the same zk session id. " +
+                    "We cannot switch shard states at this time.", e);
+                return collection; // we aren't going to make any changes right now
+              }
+            }
 
-            Map<String, Object> propMap = new HashMap<>();
-            propMap.put(Overseer.QUEUE_OPERATION, "updateshardstate");
-            propMap.put(parentSliceName, Slice.State.INACTIVE.toString());
-            propMap.put(sliceName, Slice.State.ACTIVE.toString());
-            for (Slice subShardSlice : subShardSlices) {
-              propMap.put(subShardSlice.getName(), Slice.State.ACTIVE.toString());
+            if (isLeaderSame) {
+              log.info("Sub-shard leader node is still the same one at {} with ZK session id {}. Preparing to switch shard states.", shardParentNode, shardParentZkSession);
+              Map<String, Object> propMap = new HashMap<>();
+              propMap.put(Overseer.QUEUE_OPERATION, "updateshardstate");
+              propMap.put(parentSliceName, Slice.State.INACTIVE.toString());
+              propMap.put(sliceName, Slice.State.ACTIVE.toString());
+              for (Slice subShardSlice : subShardSlices) {
+                propMap.put(subShardSlice.getName(), Slice.State.ACTIVE.toString());
+              }
+              propMap.put(ZkStateReader.COLLECTION_PROP, collection.getName());
+              ZkNodeProps m = new ZkNodeProps(propMap);
+              return new SliceMutator(zkStateReader).updateShardState(prevState, m).collection;
+            } else  {
+              // we must mark the shard split as failed by switching sub-shards to recovery_failed state
+              Map<String, Object> propMap = new HashMap<>();
+              propMap.put(Overseer.QUEUE_OPERATION, "updateshardstate");
+              propMap.put(sliceName, Slice.State.RECOVERY_FAILED.toString());
+              for (Slice subShardSlice : subShardSlices) {
+                propMap.put(subShardSlice.getName(), Slice.State.RECOVERY_FAILED.toString());
+              }
+              propMap.put(ZkStateReader.COLLECTION_PROP, collection.getName());
+              ZkNodeProps m = new ZkNodeProps(propMap);
+              return new SliceMutator(zkStateReader).updateShardState(prevState, m).collection;
             }
-            propMap.put(ZkStateReader.COLLECTION_PROP, collection.getName());
-            ZkNodeProps m = new ZkNodeProps(propMap);
-            return new SliceMutator(zkStateReader).updateShardState(prevState, m).collection;
           }
         }
       }
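
Note the round-trip: SplitShardCmd stores the ephemeralOwner as a long, it travels through cluster state
as a string, and the check above parses it back before comparing it with the live node's current owner.
Condensed into a sketch (hypothetical helper, not part of the patch):

    import org.apache.zookeeper.data.Stat;

    final class ParentLeaderCheck {
      /** True only if the parent leader's live node still exists AND is still
       *  owned by the zk session recorded when the split started. */
      static boolean parentLeaderUnchanged(Stat liveNodeStat, String recordedSessionId) {
        return liveNodeStat != null
            && liveNodeStat.getEphemeralOwner() == Long.parseLong(recordedSessionId);
      }
    }

Either failure mode (live node gone, or the same node name owned by a new session) routes the sub-shards
to RECOVERY_FAILED instead of ACTIVE, closing the data-loss window described in SOLR-9438.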

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8027eb98/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java b/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java
index 0a9aaa7..8eb323d 100644
--- a/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java
+++ b/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java
@@ -164,9 +164,10 @@ public class SliceMutator {
       log.info("Update shard state " + key + " to " + message.getStr(key));
       Map<String, Object> props = slice.shallowCopy();
       
-      if (Slice.State.getState((String) props.get(ZkStateReader.STATE_PROP)) == Slice.State.RECOVERY
-          && Slice.State.getState(message.getStr(key)) == Slice.State.ACTIVE) {
+      if (Slice.State.getState(message.getStr(key)) == Slice.State.ACTIVE) {
         props.remove(Slice.PARENT);
+        props.remove("shard_parent_node");
+        props.remove("shard_parent_zk_session");
       }
       props.put(ZkStateReader.STATE_PROP, message.getStr(key));
       Slice newSlice = new Slice(slice.getName(), slice.getReplicasCopy(), props);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8027eb98/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java b/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java
index 0000e7b..64757a6 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java
@@ -27,6 +27,7 @@ import java.util.Random;
 import java.util.Set;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
 
 import org.apache.http.params.CoreConnectionPNames;
 import org.apache.lucene.util.LuceneTestCase.Slow;
@@ -38,8 +39,10 @@ import org.apache.solr.client.solrj.embedded.JettySolrRunner;
 import org.apache.solr.client.solrj.impl.CloudSolrClient;
 import org.apache.solr.client.solrj.impl.HttpSolrClient;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.CoreAdminRequest;
 import org.apache.solr.client.solrj.request.QueryRequest;
 import org.apache.solr.client.solrj.response.CollectionAdminResponse;
+import org.apache.solr.client.solrj.response.CoreAdminResponse;
 import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.client.solrj.response.RequestStatusState;
 import org.apache.solr.common.SolrDocument;
@@ -297,6 +300,218 @@ public class ShardSplitTest extends BasicDistributedZkTest {
   }
 
   @Test
+  public void testSplitWithChaosMonkey() throws Exception {
+    waitForThingsToLevelOut(15);
+
+    List<StoppableIndexingThread> indexers = new ArrayList<>();
+    try {
+      for (int i = 0; i < 1; i++) {
+        StoppableIndexingThread thread = new StoppableIndexingThread(controlClient, cloudClient, String.valueOf(i), true);
+        indexers.add(thread);
+        thread.start();
+      }
+      Thread.sleep(1000); // give the indexers some time to do their work
+    } catch (Exception e) {
+      log.error("Error in test", e);
+    } finally {
+      for (StoppableIndexingThread indexer : indexers) {
+        indexer.safeStop();
+        indexer.join();
+      }
+    }
+
+    cloudClient.commit();
+    controlClient.commit();
+
+    AtomicBoolean stop = new AtomicBoolean();
+    AtomicBoolean killed = new AtomicBoolean(false);
+    Runnable monkey = new Runnable() {
+      @Override
+      public void run() {
+        ZkStateReader zkStateReader = cloudClient.getZkStateReader();
+        zkStateReader.registerCollectionStateWatcher(AbstractDistribZkTestBase.DEFAULT_COLLECTION, new CollectionStateWatcher() {
+          @Override
+          public boolean onStateChanged(Set<String> liveNodes, DocCollection collectionState) {
+            if (stop.get()) {
+              return true; // abort and remove the watch
+            }
+            Slice slice = collectionState.getSlice(SHARD1_0);
+            if (slice != null && slice.getReplicas().size() > 1) {
+              // ensure that only one watcher invocation thread can kill!
+              if (killed.compareAndSet(false, true))  {
+                log.info("Monkey thread found 2 replicas for {} {}", AbstractDistribZkTestBase.DEFAULT_COLLECTION, SHARD1_0);
+                CloudJettyRunner cjetty = shardToLeaderJetty.get(SHARD1);
+                try {
+                  Thread.sleep(1000 + random().nextInt(500));
+                  ChaosMonkey.kill(cjetty);
+                  stop.set(true);
+                  return true;
+                } catch (Exception e) {
+                  log.error("Monkey unable to kill jetty at port " + cjetty.jetty.getLocalPort(), e);
+                }
+              }
+            }
+            log.info("Monkey thread found only one replica for {} {}", AbstractDistribZkTestBase.DEFAULT_COLLECTION, SHARD1_0);
+            return false;
+          }
+        });
+      }
+    };
+
+    Thread monkeyThread = null;
+    /*
+     Somehow the cluster state object inside this zk state reader has a static copy of the collection which
+     is never updated, so any call to waitForRecoveriesToFinish just keeps looping until timeout.
+     We work around this by explicitly registering the collection as an interesting one so that it is watched
+     by ZkStateReader. See SOLR-9440. TODO: remove this hack after SOLR-9440 is fixed.
+    */
+    cloudClient.getZkStateReader().registerCore(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+
+    monkeyThread = new Thread(monkey);
+    monkeyThread.start();
+    try {
+      CollectionAdminRequest.SplitShard splitShard = CollectionAdminRequest.splitShard(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+      splitShard.setShardName(SHARD1);
+      String asyncId = splitShard.processAsync(cloudClient);
+      RequestStatusState splitStatus = null;
+      try {
+        splitStatus = CollectionAdminRequest.requestStatus(asyncId).waitFor(cloudClient, 120);
+      } catch (Exception e) {
+        log.warn("Failed to get request status, maybe because the overseer node was shutdown by monkey", e);
+      }
+
+      // we don't care whether the split failed, because we are injecting faults and it is likely
+      // that it did. In any case we want to assert that all docs that got indexed are available
+      // in SolrCloud, and that if the split succeeded then all replicas of each sub-shard
+      // are consistent (i.e. have the same numdocs)
+
+      log.info("Shard split request state is {}", splitStatus == null ? "unknown" : splitStatus.getKey());
+      stop.set(true);
+      monkeyThread.join();
+      Set<String> addFails = new HashSet<>();
+      Set<String> deleteFails = new HashSet<>();
+      for (StoppableIndexingThread indexer : indexers) {
+        addFails.addAll(indexer.getAddFails());
+        deleteFails.addAll(indexer.getDeleteFails());
+      }
+
+      CloudJettyRunner cjetty = shardToLeaderJetty.get(SHARD1);
+      log.info("Starting shard1 leader jetty at port {}", cjetty.jetty.getLocalPort());
+      ChaosMonkey.start(cjetty.jetty);
+      cloudClient.getZkStateReader().forceUpdateCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+      log.info("Current collection state: {}", printClusterStateInfo(AbstractDistribZkTestBase.DEFAULT_COLLECTION));
+
+      boolean replicaCreationsFailed = false;
+      if (splitStatus == RequestStatusState.FAILED)  {
+        // either the creation of one or more replicas failed (because they may have been created on the
+        // same node as the parent shard leader), or the split failed while trying to soft-commit *after*
+        // all replicas had been created. The latter counts as a successful shard-state switch even if the
+        // API doesn't say so, so we must find a way to distinguish between the two. An easy way to do that
+        // is to look at the sub-shard replicas and check whether each replica core actually exists,
+        // instead of existing solely inside the cluster state
+        DocCollection collectionState = cloudClient.getZkStateReader().getClusterState().getCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+        Slice slice10 = collectionState.getSlice(SHARD1_0);
+        Slice slice11 = collectionState.getSlice(SHARD1_1);
+        if (slice10 != null && slice11 != null) {
+          for (Replica replica : slice10) {
+            if (!doesReplicaCoreExist(replica)) {
+              replicaCreationsFailed = true;
+              break;
+            }
+          }
+          for (Replica replica : slice11) {
+            if (!doesReplicaCoreExist(replica)) {
+              replicaCreationsFailed = true;
+              break;
+            }
+          }
+        }
+      }
+
+      // true if sub-shard states switch to 'active' eventually
+      AtomicBoolean areSubShardsActive = new AtomicBoolean(false);
+
+      if (!replicaCreationsFailed)  {
+        // all sub-shard replicas were created successfully so all cores must recover eventually
+        waitForRecoveriesToFinish(AbstractDistribZkTestBase.DEFAULT_COLLECTION, true);
+        // let's wait for the overseer to switch shard states
+        CountDownLatch latch = new CountDownLatch(1);
+        cloudClient.getZkStateReader().registerCollectionStateWatcher(AbstractDistribZkTestBase.DEFAULT_COLLECTION, new CollectionStateWatcher() {
+          @Override
+          public boolean onStateChanged(Set<String> liveNodes, DocCollection collectionState) {
+            Slice parent = collectionState.getSlice(SHARD1);
+            Slice slice10 = collectionState.getSlice(SHARD1_0);
+            Slice slice11 = collectionState.getSlice(SHARD1_1);
+            if (slice10 != null && slice11 != null &&
+                parent.getState() == Slice.State.INACTIVE &&
+                slice10.getState() == Slice.State.ACTIVE &&
+                slice11.getState() == Slice.State.ACTIVE) {
+              areSubShardsActive.set(true);
+              latch.countDown();
+              return true; // removes the watch
+            } else if (slice10 != null && slice11 != null &&
+                parent.getState() == Slice.State.ACTIVE &&
+                slice10.getState() == Slice.State.RECOVERY_FAILED &&
+                slice11.getState() == Slice.State.RECOVERY_FAILED) {
+              areSubShardsActive.set(false);
+              latch.countDown();
+              return true;
+            }
+            return false;
+          }
+        });
+
+        latch.await(2, TimeUnit.MINUTES);
+
+        if (latch.getCount() != 0)  {
+          // sanity check
+          fail("We think the split was successful but sub-shard states were not updated even after 2 minutes.");
+        }
+      }
+
+      cloudClient.commit(); // for visibility of results on sub-shards
+
+      checkShardConsistency(true, true, addFails, deleteFails);
+      long ctrlDocs = controlClient.query(new SolrQuery("*:*")).getResults().getNumFound();
+      // ensure we have added more than 0 docs
+      long cloudClientDocs = cloudClient.query(new SolrQuery("*:*")).getResults().getNumFound();
+      assertTrue("Found " + cloudClientDocs + " cloud docs", cloudClientDocs > 0);
+      assertEquals("Found " + ctrlDocs + " control docs and " + cloudClientDocs + " cloud docs", ctrlDocs, cloudClientDocs);
+
+      // check consistency of sub-shard replicas explicitly because the checkShardConsistency method
+      // doesn't handle new shards/replicas so well.
+      if (areSubShardsActive.get()) {
+        ClusterState clusterState = cloudClient.getZkStateReader().getClusterState();
+        DocCollection collection = clusterState.getCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+        int numReplicasChecked = assertConsistentReplicas(collection.getSlice(SHARD1_0));
+        assertEquals("We should have checked consistency for exactly 2 replicas of shard1_0", 2, numReplicasChecked);
+        numReplicasChecked = assertConsistentReplicas(collection.getSlice(SHARD1_1));
+        assertEquals("We should have checked consistency for exactly 2 replicas of shard1_1", 2, numReplicasChecked);
+      }
+    } finally {
+      stop.set(true);
+      monkeyThread.join();
+    }
+  }
+
+  private boolean doesReplicaCoreExist(Replica replica) throws IOException {
+    try (HttpSolrClient client = new HttpSolrClient.Builder(replica.getStr(BASE_URL_PROP))
+        .withHttpClient(cloudClient.getLbClient().getHttpClient()).build())  {
+      String coreName = replica.getCoreName();
+      try {
+        CoreAdminResponse status = CoreAdminRequest.getStatus(coreName, client);
+        if (status.getCoreStatus(coreName) == null || status.getCoreStatus(coreName).size() == 0) {
+          return false;
+        }
+      } catch (Exception e) {
+        log.warn("Error getting core status of replica " + replica + ". Perhaps it does not exist!", e);
+        return false;
+      }
+    }
+    return true;
+  }
+
+  @Test
   public void testSplitShardWithRule() throws Exception {
     waitForThingsToLevelOut(15);
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8027eb98/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java b/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java
index 3ace17a..e4be009 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java
@@ -81,7 +81,16 @@ public class Slice extends ZkNodeProps implements Iterable<Replica> {
      * shard in that state still receives update requests from the parent shard
      * leader, however does not participate in distributed search.
      */
-    RECOVERY;
+    RECOVERY,
+
+    /**
+     * Sub-shards of a split shard are put in this state when the overseer deems the split failed
+     * even though all replicas are active, because the parent leader node is either no longer live
+     * or has a different ephemeral owner (zk session id). Such conditions can potentially lead to
+     * data loss. See SOLR-9438 for details. A shard in this state neither receives update requests
+     * from the parent shard leader nor participates in distributed search.
+     */
+    RECOVERY_FAILED;
     
     @Override
     public String toString() {
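
A hedged usage sketch to close (not from the patch; it assumes the SolrJ 6.x builder API): because a
RECOVERY_FAILED sub-shard receives no forwarded updates and serves no queries, and the DeleteShardCmd
change above makes such slices deletable, a client can clean up after a failed split before retrying:

    import org.apache.solr.client.solrj.SolrClient;
    import org.apache.solr.client.solrj.request.CollectionAdminRequest;
    import org.apache.solr.common.cloud.Slice;

    final class SplitRetryHelper {
      /** Delete a sub-shard left behind by a failed split so SPLITSHARD can be retried. */
      static void cleanupFailedSubShard(SolrClient client, String collection, Slice subShard)
          throws Exception {
        if (subShard.getState() == Slice.State.RECOVERY_FAILED) {
          CollectionAdminRequest.deleteShard(collection, subShard.getName()).process(client);
        }
      }
    }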