You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2016/08/27 03:41:50 UTC

[1/2] lucene-solr:branch_6x: SOLR-9445: Admin requests are retried by CloudSolrClient and LBHttpSolrClient on failure (cherry picked from commit ae40929)

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_6x 43d03430b -> 97b62160e


SOLR-9445: Admin requests are retried by CloudSolrClient and LBHttpSolrClient on failure
(cherry picked from commit ae40929)


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/5556a9b4
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/5556a9b4
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/5556a9b4

Branch: refs/heads/branch_6x
Commit: 5556a9b4def7a4320dcd537ae2cd1ed20d341f03
Parents: 43d0343
Author: Shalin Shekhar Mangar <sh...@apache.org>
Authored: Sat Aug 27 09:08:02 2016 +0530
Committer: Shalin Shekhar Mangar <sh...@apache.org>
Committed: Sat Aug 27 09:10:29 2016 +0530

----------------------------------------------------------------------
 solr/CHANGES.txt                                |  2 +
 .../solr/client/solrj/impl/CloudSolrClient.java | 13 +---
 .../client/solrj/impl/LBHttpSolrClient.java     | 20 +++---
 .../apache/solr/common/params/CommonParams.java | 10 +++
 .../client/solrj/impl/CloudSolrClientTest.java  | 68 ++++++++++++++++++--
 5 files changed, 87 insertions(+), 26 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5556a9b4/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 2f17f7b..325c5dc 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -43,6 +43,8 @@ Bug Fixes
 * SOLR-6744: fl renaming / alias of uniqueKey field generates null pointer exception in SolrCloud configuration
   (Mike Drob via Tomás Fernández Löbbe)
 
+* SOLR-9445: Admin requests are retried by CloudSolrClient and LBHttpSolrClient on failure. (shalin)
+
 Optimizations
 ----------------------
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5556a9b4/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java
index 4bce970..cf2b5a7 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java
@@ -22,7 +22,6 @@ import java.net.ConnectException;
 import java.net.SocketException;
 import java.nio.file.Path;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
@@ -85,11 +84,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.slf4j.MDC;
 
-import static org.apache.solr.common.params.CommonParams.AUTHC_PATH;
-import static org.apache.solr.common.params.CommonParams.AUTHZ_PATH;
-import static org.apache.solr.common.params.CommonParams.COLLECTIONS_HANDLER_PATH;
-import static org.apache.solr.common.params.CommonParams.CONFIGSETS_HANDLER_PATH;
-import static org.apache.solr.common.params.CommonParams.CORES_HANDLER_PATH;
+import static org.apache.solr.common.params.CommonParams.ADMIN_PATHS;
 
 /**
  * SolrJ client class to communicate with SolrCloud.
@@ -996,12 +991,6 @@ public class CloudSolrClient extends SolrClient {
       collection = (reqParams != null) ? reqParams.get("collection", getDefaultCollection()) : getDefaultCollection();
     return requestWithRetryOnStaleState(request, 0, collection);
   }
-  private static final Set<String> ADMIN_PATHS = new HashSet<>(Arrays.asList(
-      CORES_HANDLER_PATH,
-      COLLECTIONS_HANDLER_PATH,
-      CONFIGSETS_HANDLER_PATH,
-      AUTHC_PATH,
-      AUTHZ_PATH));
 
   /**
    * As this class doesn't watch external collections on the client side,

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5556a9b4/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttpSolrClient.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttpSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttpSolrClient.java
index 9b7d3fe..9daa408 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttpSolrClient.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttpSolrClient.java
@@ -53,6 +53,8 @@ import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.SolrjNamedThreadFactory;
 import org.slf4j.MDC;
 
+import static org.apache.solr.common.params.CommonParams.ADMIN_PATHS;
+
 /**
  * LBHttpSolrClient or "LoadBalanced HttpSolrClient" is a load balancing wrapper around
  * {@link HttpSolrClient}. This is useful when you
@@ -321,7 +323,7 @@ public class LBHttpSolrClient extends SolrClient {
   public Rsp request(Req req) throws SolrServerException, IOException {
     Rsp rsp = new Rsp();
     Exception ex = null;
-    boolean isUpdate = req.request instanceof IsUpdateRequest;
+    boolean isNonRetryable = req.request instanceof IsUpdateRequest || ADMIN_PATHS.contains(req.request.getPath());
     List<ServerWrapper> skipped = null;
 
     long timeAllowedNano = getTimeAllowedInNanos(req.getRequest());
@@ -352,7 +354,7 @@ public class LBHttpSolrClient extends SolrClient {
         MDC.put("LBHttpSolrClient.url", serverStr);
         HttpSolrClient client = makeSolrClient(serverStr);
 
-        ex = doRequest(client, req, rsp, isUpdate, false, null);
+        ex = doRequest(client, req, rsp, isNonRetryable, false, null);
         if (ex == null) {
           return rsp; // SUCCESS
         }
@@ -368,7 +370,7 @@ public class LBHttpSolrClient extends SolrClient {
           break;
         }
 
-        ex = doRequest(wrapper.client, req, rsp, isUpdate, true, wrapper.getKey());
+        ex = doRequest(wrapper.client, req, rsp, isNonRetryable, true, wrapper.getKey());
         if (ex == null) {
           return rsp; // SUCCESS
         }
@@ -395,7 +397,7 @@ public class LBHttpSolrClient extends SolrClient {
     return e;
   }  
 
-  protected Exception doRequest(HttpSolrClient client, Req req, Rsp rsp, boolean isUpdate,
+  protected Exception doRequest(HttpSolrClient client, Req req, Rsp rsp, boolean isNonRetryable,
       boolean isZombie, String zombieKey) throws SolrServerException, IOException {
     Exception ex = null;
     try {
@@ -407,7 +409,7 @@ public class LBHttpSolrClient extends SolrClient {
     } catch (SolrException e) {
       // we retry on 404 or 403 or 503 or 500
       // unless it's an update - then we only retry on connect exception
-      if (!isUpdate && RETRY_CODES.contains(e.code())) {
+      if (!isNonRetryable && RETRY_CODES.contains(e.code())) {
         ex = (!isZombie) ? addZombie(client, e) : e;
       } else {
         // Server is alive but the request was likely malformed or invalid
@@ -417,22 +419,22 @@ public class LBHttpSolrClient extends SolrClient {
         throw e;
       }
     } catch (SocketException e) {
-      if (!isUpdate || e instanceof ConnectException) {
+      if (!isNonRetryable || e instanceof ConnectException) {
         ex = (!isZombie) ? addZombie(client, e) : e;
       } else {
         throw e;
       }
     } catch (SocketTimeoutException e) {
-      if (!isUpdate) {
+      if (!isNonRetryable) {
         ex = (!isZombie) ? addZombie(client, e) : e;
       } else {
         throw e;
       }
     } catch (SolrServerException e) {
       Throwable rootCause = e.getRootCause();
-      if (!isUpdate && rootCause instanceof IOException) {
+      if (!isNonRetryable && rootCause instanceof IOException) {
         ex = (!isZombie) ? addZombie(client, e) : e;
-      } else if (isUpdate && rootCause instanceof ConnectException) {
+      } else if (isNonRetryable && rootCause instanceof ConnectException) {
         ex = (!isZombie) ? addZombie(client, e) : e;
       } else {
         throw e;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5556a9b4/solr/solrj/src/java/org/apache/solr/common/params/CommonParams.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/common/params/CommonParams.java b/solr/solrj/src/java/org/apache/solr/common/params/CommonParams.java
index 5ccd70f..b830b41 100644
--- a/solr/solrj/src/java/org/apache/solr/common/params/CommonParams.java
+++ b/solr/solrj/src/java/org/apache/solr/common/params/CommonParams.java
@@ -16,7 +16,10 @@
  */
 package org.apache.solr.common.params;
 
+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.Locale;
+import java.util.Set;
 
 
 /**
@@ -178,6 +181,13 @@ public interface CommonParams {
   public static final String AUTHC_PATH = "/admin/authentication";
   public static final String ZK_PATH = "/admin/zookeeper";
 
+  public static final Set<String> ADMIN_PATHS = new HashSet<>(Arrays.asList(
+      CORES_HANDLER_PATH,
+      COLLECTIONS_HANDLER_PATH,
+      CONFIGSETS_HANDLER_PATH,
+      AUTHC_PATH,
+      AUTHZ_PATH));
+
   /** valid values for: <code>echoParams</code> */
   public enum EchoParamStyle {
     EXPLICIT,

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5556a9b4/solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudSolrClientTest.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudSolrClientTest.java b/solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudSolrClientTest.java
index 4e8a403..a16e38e 100644
--- a/solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudSolrClientTest.java
+++ b/solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudSolrClientTest.java
@@ -22,6 +22,7 @@ import java.net.URL;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
@@ -38,11 +39,13 @@ import org.apache.lucene.util.TestUtil;
 import org.apache.solr.client.solrj.SolrClient;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.embedded.JettySolrRunner;
 import org.apache.solr.client.solrj.request.AbstractUpdateRequest;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.client.solrj.request.QueryRequest;
 import org.apache.solr.client.solrj.request.UpdateRequest;
 import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.client.solrj.response.RequestStatusState;
 import org.apache.solr.client.solrj.response.UpdateResponse;
 import org.apache.solr.cloud.AbstractDistribZkTestBase;
 import org.apache.solr.cloud.SolrCloudTestCase;
@@ -60,6 +63,9 @@ import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.params.ShardParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.SimpleOrderedMap;
+import org.apache.solr.handler.admin.CollectionsHandler;
+import org.apache.solr.handler.admin.ConfigSetsHandler;
+import org.apache.solr.handler.admin.CoreAdminHandler;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Rule;
@@ -80,10 +86,11 @@ public class CloudSolrClientTest extends SolrCloudTestCase {
   private static final String id = "id";
 
   private static final int TIMEOUT = 30;
+  private static final int NODE_COUNT = 3;
 
   @BeforeClass
   public static void setupCluster() throws Exception {
-    configureCluster(3)
+    configureCluster(NODE_COUNT)
         .addConfig("conf", getFile("solrj").toPath().resolve("solr").resolve("configsets").resolve("streaming").resolve("conf"))
         .configure();
 
@@ -384,6 +391,11 @@ public class CloudSolrClientTest extends SolrCloudTestCase {
 
   private Long getNumRequests(String baseUrl, String collectionName) throws
       SolrServerException, IOException {
+    return getNumRequests(baseUrl, collectionName, "QUERYHANDLER", "standard", false);
+  }
+
+  private Long getNumRequests(String baseUrl, String collectionName, String category, String key, boolean returnNumErrors) throws
+      SolrServerException, IOException {
 
     NamedList<Object> resp;
     try (HttpSolrClient client = getHttpSolrClient(baseUrl + "/"+ collectionName)) {
@@ -392,14 +404,60 @@ public class CloudSolrClientTest extends SolrCloudTestCase {
       ModifiableSolrParams params = new ModifiableSolrParams();
       params.set("qt", "/admin/mbeans");
       params.set("stats", "true");
-      params.set("key", "standard");
-      params.set("cat", "QUERYHANDLER");
+      params.set("key", key);
+      params.set("cat", category);
       // use generic request to avoid extra processing of queries
       QueryRequest req = new QueryRequest(params);
       resp = client.request(req);
     }
-    return (Long) resp.findRecursive("solr-mbeans", "QUERYHANDLER",
-        "standard", "stats", "requests");
+    return (Long) resp.findRecursive("solr-mbeans", category, key, "stats", returnNumErrors ? "errors" : "requests");
+  }
+
+  @Test
+  public void testNonRetryableRequests() throws Exception {
+    try (CloudSolrClient client = getCloudSolrClient(cluster.getZkServer().getZkAddress())) {
+      // important to have one replica on each node
+      RequestStatusState state = CollectionAdminRequest.createCollection("foo", "conf", 1, NODE_COUNT).processAndWait(client, 60);
+      if (state == RequestStatusState.COMPLETED) {
+        AbstractDistribZkTestBase.waitForRecoveriesToFinish("foo", client.getZkStateReader(), true, true, TIMEOUT);
+        client.setDefaultCollection("foo");
+
+        Map<String, String> adminPathToMbean = new HashMap<>(CommonParams.ADMIN_PATHS.size());
+        adminPathToMbean.put(CommonParams.COLLECTIONS_HANDLER_PATH, CollectionsHandler.class.getName());
+        adminPathToMbean.put(CommonParams.CORES_HANDLER_PATH, CoreAdminHandler.class.getName());
+        adminPathToMbean.put(CommonParams.CONFIGSETS_HANDLER_PATH, ConfigSetsHandler.class.getName());
+        // we do not add the authc/authz handlers because they do not currently expose any mbeans
+
+        for (String adminPath : adminPathToMbean.keySet()) {
+          long errorsBefore = 0;
+          for (JettySolrRunner runner : cluster.getJettySolrRunners()) {
+            Long numRequests = getNumRequests(runner.getBaseUrl().toString(), "foo", "QUERYHANDLER", adminPathToMbean.get(adminPath), true);
+            errorsBefore += numRequests;
+            log.info("Found {} requests to {} on {}", numRequests, adminPath, runner.getBaseUrl());
+          }
+
+          ModifiableSolrParams params = new ModifiableSolrParams();
+          params.set("qt", adminPath);
+          params.set("action", "foobar"); // this should cause an error
+          QueryRequest req = new QueryRequest(params);
+          try {
+            NamedList<Object> resp = client.request(req);
+            fail("call to foo for admin path " + adminPath + " should have failed");
+          } catch (Exception e) {
+            // expected
+          }
+          long errorsAfter = 0;
+          for (JettySolrRunner runner : cluster.getJettySolrRunners()) {
+            Long numRequests = getNumRequests(runner.getBaseUrl().toString(), "foo", "QUERYHANDLER", adminPathToMbean.get(adminPath), true);
+            errorsAfter += numRequests;
+            log.info("Found {} requests to {} on {}", numRequests, adminPath, runner.getBaseUrl());
+          }
+          assertEquals(errorsBefore + 1, errorsAfter);
+        }
+      } else {
+        fail("Collection could not be created within 60 seconds");
+      }
+    }
   }
 
   @Test


[2/2] lucene-solr:branch_6x: SOLR-9439: Shard split clean up logic for older failed splits is faulty (cherry picked from commit 7d2f42e)

Posted by sh...@apache.org.
SOLR-9439: Shard split clean up logic for older failed splits is faulty
(cherry picked from commit 7d2f42e)


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/97b62160
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/97b62160
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/97b62160

Branch: refs/heads/branch_6x
Commit: 97b62160e90a262e7b05883d13b8af45d9052705
Parents: 5556a9b
Author: Shalin Shekhar Mangar <sh...@apache.org>
Authored: Sat Aug 27 09:08:53 2016 +0530
Committer: Shalin Shekhar Mangar <sh...@apache.org>
Committed: Sat Aug 27 09:10:44 2016 +0530

----------------------------------------------------------------------
 solr/CHANGES.txt                                |  2 +
 .../org/apache/solr/cloud/SplitShardCmd.java    | 62 ++++++++++++++++----
 .../org/apache/solr/core/CoreContainer.java     |  7 ++-
 .../org/apache/solr/util/TestInjection.java     | 20 +++++++
 .../org/apache/solr/cloud/ShardSplitTest.java   | 54 +++++++++++++++++
 5 files changed, 130 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/97b62160/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 325c5dc..0bb23a2 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -45,6 +45,8 @@ Bug Fixes
 
 * SOLR-9445: Admin requests are retried by CloudSolrClient and LBHttpSolrClient on failure. (shalin)
 
+* SOLR-9439: Shard split clean up logic for older failed splits is faulty. (shalin)
+
 Optimizations
 ----------------------
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/97b62160/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java
index d7bbf66..4463285 100644
--- a/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java
@@ -46,6 +46,7 @@ import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.Utils;
 import org.apache.solr.handler.component.ShardHandler;
+import org.apache.solr.util.TestInjection;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -79,6 +80,7 @@ public class SplitShardCmd implements Cmd {
 
     log.info("Split shard invoked");
     ZkStateReader zkStateReader = ocmh.zkStateReader;
+    zkStateReader.forceUpdateCollection(collectionName);
 
     String splitKey = message.getStr("split.key");
     ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler();
@@ -197,7 +199,10 @@ public class SplitShardCmd implements Cmd {
         subSlices.add(subSlice);
         String subShardName = collectionName + "_" + subSlice + "_replica1";
         subShardNames.add(subShardName);
+      }
 
+      boolean oldShardsDeleted = false;
+      for (String subSlice : subSlices) {
         Slice oSlice = collection.getSlice(subSlice);
         if (oSlice != null) {
           final Slice.State state = oSlice.getState();
@@ -206,24 +211,33 @@ public class SplitShardCmd implements Cmd {
                 "Sub-shard: " + subSlice + " exists in active state. Aborting split shard.");
           } else if (state == Slice.State.CONSTRUCTION || state == Slice.State.RECOVERY) {
             // delete the shards
-            for (String sub : subSlices) {
-              log.info("Sub-shard: {} already exists therefore requesting its deletion", sub);
-              Map<String, Object> propMap = new HashMap<>();
-              propMap.put(Overseer.QUEUE_OPERATION, "deleteshard");
-              propMap.put(COLLECTION_PROP, collectionName);
-              propMap.put(SHARD_ID_PROP, sub);
-              ZkNodeProps m = new ZkNodeProps(propMap);
-              try {
-                ocmh.commandMap.get(DELETESHARD).call(clusterState, m, new NamedList());
-              } catch (Exception e) {
-                throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + sub,
-                    e);
-              }
+            log.info("Sub-shard: {} already exists therefore requesting its deletion", subSlice);
+            Map<String, Object> propMap = new HashMap<>();
+            propMap.put(Overseer.QUEUE_OPERATION, "deleteshard");
+            propMap.put(COLLECTION_PROP, collectionName);
+            propMap.put(SHARD_ID_PROP, subSlice);
+            ZkNodeProps m = new ZkNodeProps(propMap);
+            try {
+              ocmh.commandMap.get(DELETESHARD).call(clusterState, m, new NamedList());
+            } catch (SolrException e) {
+              throwIfNotNonExistentCoreException(subSlice, e);
+            } catch (Exception e) {
+              throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice,
+                  e);
             }
+
+            oldShardsDeleted = true;
           }
         }
       }
 
+      if (oldShardsDeleted) {
+        // refresh the locally cached cluster state
+        zkStateReader.forceUpdateCollection(collectionName);
+        clusterState = zkStateReader.getClusterState();
+        collection = clusterState.getCollection(collectionName);
+      }
+
       final String asyncId = message.getStr(ASYNC);
       Map<String, String> requestMap = new HashMap<>();
 
@@ -406,6 +420,8 @@ public class SplitShardCmd implements Cmd {
         replicas.add(propMap);
       }
 
+      assert TestInjection.injectSplitFailureBeforeReplicaCreation();
+
       // we must set the slice state into recovery before actually creating the replica cores
       // this ensures that the logic inside Overseer to update sub-shard state to 'active'
       // always gets a chance to execute. See SOLR-7673
@@ -455,4 +471,24 @@ public class SplitShardCmd implements Cmd {
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, null, e);
     }
   }
+
+  private void throwIfNotNonExistentCoreException(String subSlice, SolrException e) {
+    Throwable t = e;
+    String cause = null;
+    while (t != null) {
+      if (t instanceof SolrException) {
+        SolrException solrException = (SolrException) t;
+        cause = solrException.getMetadata("cause");
+        if (cause != null && !"NonExistentCore".equals(cause)) {
+          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice,
+              e);
+        }
+      }
+      t = t.getCause();
+    }
+    if (!"NonExistentCore".equals(cause)) {
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice,
+          e);
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/97b62160/solr/core/src/java/org/apache/solr/core/CoreContainer.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index b4442df..7f80b13 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -996,8 +996,11 @@ public class CoreContainer {
     }
 
     CoreDescriptor cd = solrCores.getCoreDescriptor(name);
-    if (cd == null)
-      throw new SolrException(ErrorCode.BAD_REQUEST, "Cannot unload non-existent core [" + name + "]");
+    if (cd == null) {
+      SolrException solrException = new SolrException(ErrorCode.BAD_REQUEST, "Cannot unload non-existent core [" + name + "]");
+      solrException.setMetadata("cause", "NonExistentCore");
+      throw solrException;
+    }
 
     boolean close = solrCores.isLoadedNotPendingClose(name);
     SolrCore core = solrCores.remove(name);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/97b62160/solr/core/src/java/org/apache/solr/util/TestInjection.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/util/TestInjection.java b/solr/core/src/java/org/apache/solr/util/TestInjection.java
index 03de74d..efd80bf 100644
--- a/solr/core/src/java/org/apache/solr/util/TestInjection.java
+++ b/solr/core/src/java/org/apache/solr/util/TestInjection.java
@@ -113,6 +113,8 @@ public class TestInjection {
   public static String randomDelayInCoreCreation = null;
   
   public static int randomDelayMaxInCoreCreationInSec = 10;
+
+  public static String splitFailureBeforeReplicaCreation = null;
   
   private static Set<Timer> timers = Collections.synchronizedSet(new HashSet<Timer>());
 
@@ -124,6 +126,7 @@ public class TestInjection {
     updateLogReplayRandomPause = null;
     updateRandomPause = null;
     randomDelayInCoreCreation = null;
+    splitFailureBeforeReplicaCreation = null;
 
     for (Timer timer : timers) {
       timer.cancel();
@@ -285,6 +288,23 @@ public class TestInjection {
 
     return true;
   }
+
+  public static boolean injectSplitFailureBeforeReplicaCreation() {
+    if (splitFailureBeforeReplicaCreation != null)  {
+      Random rand = random();
+      if (null == rand) return true;
+
+      Pair<Boolean,Integer> pair = parseValue(splitFailureBeforeReplicaCreation);
+      boolean enabled = pair.first();
+      int chanceIn100 = pair.second();
+      if (enabled && rand.nextInt(100) >= (100 - chanceIn100)) {
+        log.info("Injecting failure in creating replica for sub-shard");
+        throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to create replica");
+      }
+    }
+
+    return true;
+  }
   
   private static Pair<Boolean,Integer> parseValue(String raw) {
     Matcher m = ENABLED_PERCENT.matcher(raw);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/97b62160/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java b/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java
index 21dc257..389660f 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java
@@ -41,6 +41,7 @@ import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.cloud.ClusterState;
 import org.apache.solr.common.cloud.CompositeIdRouter;
+import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.DocRouter;
 import org.apache.solr.common.cloud.HashBasedRouter;
 import org.apache.solr.common.cloud.Replica;
@@ -50,6 +51,7 @@ import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.CollectionParams;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.util.Utils;
+import org.apache.solr.util.TestInjection;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -90,6 +92,58 @@ public class ShardSplitTest extends BasicDistributedZkTest {
     //waitForThingsToLevelOut(15);
   }
 
+  /**
+   * Used to test that we can split a shard when a previous split event
+   * left sub-shards in construction or recovery state.
+   *
+   * See SOLR-9439
+   */
+  @Test
+  public void testSplitAfterFailedSplit() throws Exception {
+    waitForThingsToLevelOut(15);
+
+    TestInjection.splitFailureBeforeReplicaCreation = "true:100"; // we definitely want split to fail
+    try {
+      try {
+        CollectionAdminRequest.SplitShard splitShard = CollectionAdminRequest.splitShard(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+        splitShard.setShardName(SHARD1);
+        splitShard.process(cloudClient);
+        fail("Shard split was not supposed to succeed after failure injection!");
+      } catch (Exception e) {
+        // expected
+      }
+
+      // assert that sub-shards cores exist and sub-shard is in construction state
+      ZkStateReader zkStateReader = cloudClient.getZkStateReader();
+      zkStateReader.forceUpdateCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+      ClusterState state = zkStateReader.getClusterState();
+      DocCollection collection = state.getCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+
+      Slice shard10 = collection.getSlice(SHARD1_0);
+      assertEquals(Slice.State.CONSTRUCTION, shard10.getState());
+      assertEquals(1, shard10.getReplicas().size());
+
+      Slice shard11 = collection.getSlice(SHARD1_1);
+      assertEquals(Slice.State.CONSTRUCTION, shard11.getState());
+      assertEquals(1, shard11.getReplicas().size());
+
+      // lets retry the split
+      TestInjection.reset(); // let the split succeed
+      try {
+        CollectionAdminRequest.SplitShard splitShard = CollectionAdminRequest.splitShard(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+        splitShard.setShardName(SHARD1);
+        splitShard.process(cloudClient);
+        // Yay!
+      } catch (Exception e) {
+        log.error("Shard split failed", e);
+        fail("Shard split did not succeed after a previous failed split attempt left sub-shards in construction state");
+      }
+
+    } finally {
+      TestInjection.reset();
+    }
+  }
+
   @Test
   public void testSplitShardWithRule() throws Exception {
     waitForThingsToLevelOut(15);