You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ho...@apache.org on 2016/03/22 17:40:46 UTC

[1/5] lucene-solr:jira/SOLR-445: SOLR-445: remove nocommits related to OOM trapping since SOLR-8539 has concluded that this isn't a thing the java code actually needs to be defensive of

Repository: lucene-solr
Updated Branches:
  refs/heads/jira/SOLR-445 21c0fe690 -> cc2cd23ca


SOLR-445: remove nocommits related to OOM trapping since SOLR-8539 has concluded that this isn't a thing the java code actually needs to be defensive of


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/fe54da0b
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/fe54da0b
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/fe54da0b

Branch: refs/heads/jira/SOLR-445
Commit: fe54da0b58ed18a38f3dd436dd3f30fbee9acbbf
Parents: 21c0fe6
Author: Chris Hostetter <ho...@apache.org>
Authored: Mon Mar 21 10:53:43 2016 -0700
Committer: Chris Hostetter <ho...@apache.org>
Committed: Mon Mar 21 10:54:19 2016 -0700

----------------------------------------------------------------------
 .../solr/update/processor/TolerantUpdateProcessor.java  | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fe54da0b/solr/core/src/java/org/apache/solr/update/processor/TolerantUpdateProcessor.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/update/processor/TolerantUpdateProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/TolerantUpdateProcessor.java
index 9f9ff5e..79573c9 100644
--- a/solr/core/src/java/org/apache/solr/update/processor/TolerantUpdateProcessor.java
+++ b/solr/core/src/java/org/apache/solr/update/processor/TolerantUpdateProcessor.java
@@ -156,7 +156,7 @@ public class TolerantUpdateProcessor extends UpdateRequestProcessor {
       
       super.processAdd(cmd);
 
-    } catch (Throwable t) { // nocommit: OOM trap
+    } catch (Throwable t) { 
       firstErrTracker.caught(t);
       
       if (isLeader || distribPhase.equals(DistribPhase.NONE)) {
@@ -188,7 +188,7 @@ public class TolerantUpdateProcessor extends UpdateRequestProcessor {
       
       super.processDelete(cmd);
       
-    } catch (Throwable t) { // nocommit: OOM trap
+    } catch (Throwable t) {
       firstErrTracker.caught(t);
       
       ToleratedUpdateError err = new ToleratedUpdateError(cmd.isDeleteById() ? CmdType.DELID : CmdType.DELQ,
@@ -214,7 +214,7 @@ public class TolerantUpdateProcessor extends UpdateRequestProcessor {
   public void processMergeIndexes(MergeIndexesCommand cmd) throws IOException {
     try {
       super.processMergeIndexes(cmd);
-    } catch (Throwable t) { // nocommit: OOM trap
+    } catch (Throwable t) {
       // we're not tolerante of errors from this type of command, but we
       // do need to track it so we can annotate it with any other errors we were allready tolerant of
       firstErrTracker.caught(t);
@@ -226,7 +226,7 @@ public class TolerantUpdateProcessor extends UpdateRequestProcessor {
   public void processCommit(CommitUpdateCommand cmd) throws IOException {
     try {
       super.processCommit(cmd);
-    } catch (Throwable t) { // nocommit: OOM trap
+    } catch (Throwable t) {
       // we're not tolerante of errors from this type of command, but we
       // do need to track it so we can annotate it with any other errors we were allready tolerant of
       firstErrTracker.caught(t);
@@ -238,7 +238,7 @@ public class TolerantUpdateProcessor extends UpdateRequestProcessor {
   public void processRollback(RollbackUpdateCommand cmd) throws IOException {
     try {
       super.processRollback(cmd);
-    } catch (Throwable t) { // nocommit: OOM trap
+    } catch (Throwable t) {
       // we're not tolerante of errors from this type of command, but we
       // do need to track it so we can annotate it with any other errors we were allready tolerant of
       firstErrTracker.caught(t);
@@ -370,7 +370,7 @@ public class TolerantUpdateProcessor extends UpdateRequestProcessor {
      * even if you are going to ignore it (for now).  If you plan to rethrow the Exception, use 
      * {@link #throwFirst} instead.
      */
-    public void caught(Throwable t) {    // nocommit: switch to just Exception?
+    public void caught(Throwable t) {
       assert null != t;
       if (null == first) {
         if (t instanceof SolrException) {


[5/5] lucene-solr:jira/SOLR-445: SOLR-445: cloud test & bug fix for docs missing their uniqueKey field

Posted by ho...@apache.org.
SOLR-445: cloud test & bug fix for docs missing their uniqueKey field


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/cc2cd23c
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/cc2cd23c
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/cc2cd23c

Branch: refs/heads/jira/SOLR-445
Commit: cc2cd23ca2537324dc7e4afe6a29605bbf9f1cb8
Parents: b6be74f
Author: Chris Hostetter <ho...@apache.org>
Authored: Tue Mar 22 09:25:33 2016 -0700
Committer: Chris Hostetter <ho...@apache.org>
Committed: Tue Mar 22 09:25:33 2016 -0700

----------------------------------------------------------------------
 .../processor/TolerantUpdateProcessor.java      |  4 +-
 .../cloud/TestTolerantUpdateProcessorCloud.java | 91 ++++++++++++++++++++
 2 files changed, 94 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc2cd23c/solr/core/src/java/org/apache/solr/update/processor/TolerantUpdateProcessor.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/update/processor/TolerantUpdateProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/TolerantUpdateProcessor.java
index 79573c9..316a8d0 100644
--- a/solr/core/src/java/org/apache/solr/update/processor/TolerantUpdateProcessor.java
+++ b/solr/core/src/java/org/apache/solr/update/processor/TolerantUpdateProcessor.java
@@ -147,12 +147,14 @@ public class TolerantUpdateProcessor extends UpdateRequestProcessor {
   
   @Override
   public void processAdd(AddUpdateCommand cmd) throws IOException {
-    boolean isLeader = isLeader(cmd); // nocommit: is this needed? see below...
+    boolean isLeader = true; // set below during 'try'   // nocommit: is this var really needed (see below)
     BytesRef id = null;
     
     try {
       // force AddUpdateCommand to validate+cache the id before proceeding
       id = cmd.getIndexedId();
+      // if the id is missing from doc, act like we're the leader, let downstream throw error
+      isLeader = (null == id) || isLeader(cmd); // nocommit: is this needed? see below...
       
       super.processAdd(cmd);
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc2cd23c/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java b/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java
index 48c81de..236213e 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java
@@ -493,6 +493,38 @@ public class TestTolerantUpdateProcessorCloud extends SolrCloudTestCase {
 
     // clean slate
     assertEquals(0, client.deleteByQuery("*:*").getStatus());
+
+    // many docs from diff shards, 1 from each shard should fail and 1 w/o uniqueKey
+    
+    rsp = update(params("update.chain", "tolerant-chain-max-errors-10",
+                        "commit", "true"),
+                 doc(f("id", S_ONE_PRE + "11")),
+                 doc(f("id", S_TWO_PRE + "21")),
+                 doc(f("id", S_ONE_PRE + "12")),
+                 doc(f("id", S_TWO_PRE + "22"), f("foo_i", "bogus_val")),
+                 doc(f("id", S_ONE_PRE + "13")),
+                 doc(f("id", S_TWO_PRE + "23")),
+                 doc(f("foo_i", "42")),          // no "id"
+                 doc(f("id", S_ONE_PRE + "14")),
+                 doc(f("id", S_TWO_PRE + "24")),
+                 doc(f("id", S_ONE_PRE + "15"), f("foo_i", "bogus_val")),
+                 doc(f("id", S_TWO_PRE + "25")),
+                 doc(f("id", S_ONE_PRE + "16")),
+                 doc(f("id", S_TWO_PRE + "26"))).process(client);
+    
+    assertEquals(0, rsp.getStatus());
+    assertUpdateTolerantAddErrors("many docs, 1 from each shard (+ no id) should fail", rsp,
+                                  S_ONE_PRE + "15",
+                                  "(unknown)",
+                                  S_TWO_PRE + "22");
+    assertQueryDocIds(client, false, S_TWO_PRE + "22", S_ONE_PRE + "15");
+    assertQueryDocIds(client, true,
+                      S_ONE_PRE + "11", S_TWO_PRE + "21", S_ONE_PRE + "12",
+                      S_ONE_PRE + "13", S_TWO_PRE + "23", S_ONE_PRE + "14", S_TWO_PRE + "24",
+                      S_TWO_PRE + "25", S_ONE_PRE + "16", S_TWO_PRE + "26");
+
+    // clean slate
+    assertEquals(0, client.deleteByQuery("*:*").getStatus());
     
     // many docs from diff shards, more then 10 (total) should fail
 
@@ -652,6 +684,65 @@ public class TestTolerantUpdateProcessorCloud extends SolrCloudTestCase {
                       // , S_ONE_PRE + "x", S_TWO_PRE + "x", // skipped
                       );
 
+    // clean slate
+    assertEquals(0, client.deleteByQuery("*:*").getStatus());
+    
+    // many docs from diff shards, more than 10 don't have any uniqueKey specified
+
+    try {
+      ArrayList<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(30);
+      docs.add(doc(f("id", S_ONE_PRE + "z")));
+      docs.add(doc(f("id", S_TWO_PRE + "z")));
+      docs.add(doc(f("id", S_ONE_PRE + "y")));
+      docs.add(doc(f("id", S_TWO_PRE + "y")));
+      for (int i = 0; i < 11; i++) {
+        // no "id" field
+        docs.add(doc(f("foo_i", "" + i)));
+      }
+      docs.add(doc(f("id", S_ONE_PRE + "x"))); // may be skipped, more then 10 fails
+      docs.add(doc(f("id", S_TWO_PRE + "x"))); // may be skipped, more then 10 fails
+          
+      rsp = update(params("update.chain", "tolerant-chain-max-errors-10",
+                          "commit", "true"),
+                   docs.toArray(new SolrInputDocument[docs.size()])).process(client);
+      
+      fail("did not get a top level exception when more then 10 docs mising uniqueKey: " + rsp.toString());
+    } catch (SolrException e) {
+      // we can't make any reliable assertions about the error message, because
+      // it varies based on how the request was routed -- see SOLR-8830
+      assertEquals("not the type of error we were expecting ("+e.code()+"): " + e.toString(),
+                   // NOTE: we always expect a 400 because we know that's what we would get from these types of errors
+                   // on a single node setup -- a 5xx type error isn't something we should have triggered
+                   400, e.code());
+
+      // verify that the Exceptions metadata can tell us what failed.
+      NamedList<String> remoteErrMetadata = e.getMetadata();
+      assertNotNull("no metadata in: " + e.toString(), remoteErrMetadata);
+      int actualKnownErrsCount = 0;
+      for (int i = 0; i < remoteErrMetadata.size(); i++) {
+        ToleratedUpdateError err =
+          ToleratedUpdateError.parseMetadataIfToleratedUpdateError(remoteErrMetadata.getName(i),
+                                                                   remoteErrMetadata.getVal(i));
+        if (null == err) {
+          // some metadata unrelated to this update processor
+          continue;
+        }
+        actualKnownErrsCount++;
+        assertEquals("only expected type of error is ADD: " + err,
+                     CmdType.ADD, err.getType());
+        assertTrue("failed id didn't match 'unknown': " + err,
+                   err.getId().contains("unknown"));
+      }
+      assertEquals("wrong number of errors in metadata: " + remoteErrMetadata.toString(),
+                   11, actualKnownErrsCount);
+    }
+    assertEquals(0, client.commit().getStatus()); // need to force since update didn't finish
+    assertQueryDocIds(client, true
+                      , S_ONE_PRE + "z", S_ONE_PRE + "y", S_TWO_PRE + "z", S_TWO_PRE + "y" // first
+                      // // we can't assert for sure these docs were skipped or added
+                      // // depending on shard we hit, they may have been added async before errors were exceeded
+                      // , S_ONE_PRE + "x", S_TWO_PRE + "x" // skipped
+                      );
   }
 
   //


[3/5] lucene-solr:jira/SOLR-445: SOLR-8881: replace nocommits with doc note and link to jira

Posted by ho...@apache.org.
SOLR-8881: replace nocommits with doc note and link to jira


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c740e696
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c740e696
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c740e696

Branch: refs/heads/jira/SOLR-445
Commit: c740e69622f3c0295498f02e76e42af6341ba333
Parents: 5d93384
Author: Chris Hostetter <ho...@apache.org>
Authored: Mon Mar 21 16:36:58 2016 -0700
Committer: Chris Hostetter <ho...@apache.org>
Committed: Mon Mar 21 16:36:58 2016 -0700

----------------------------------------------------------------------
 .../update/processor/TolerantUpdateProcessorFactory.java  |  7 +++++++
 .../solr/cloud/TestTolerantUpdateProcessorCloud.java      | 10 ----------
 2 files changed, 7 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c740e696/solr/core/src/java/org/apache/solr/update/processor/TolerantUpdateProcessorFactory.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/update/processor/TolerantUpdateProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/TolerantUpdateProcessorFactory.java
index 35ca63b..e7f5653 100644
--- a/solr/core/src/java/org/apache/solr/update/processor/TolerantUpdateProcessorFactory.java
+++ b/solr/core/src/java/org/apache/solr/update/processor/TolerantUpdateProcessorFactory.java
@@ -69,6 +69,13 @@ import static org.apache.solr.update.processor.DistributingUpdateProcessorFactor
  * curl http://localhost:8983/update?update.chain=tolerant-chain&amp;maxErrors=100 -H "Content-Type: text/xml" -d @myfile.xml
  * </pre>
  * 
+ * <p>
+ * <b>NOTE:</b> The behavior of this UpdateProcessorFactory in conjunction with indexing operations 
+ * while a Shard Split is actively in progress is not well defined (or sufficiently tested).  Users 
+ * of this update processor are encouraged to either disable it, or pause updates, while any shard 
+ * splitting is in progress (see <a href="https://issues.apache.org/jira/browse/SOLR-8881">SOLR-8881</a> 
+ * for more details.)
+ * </p>
  */
 public class TolerantUpdateProcessorFactory extends UpdateRequestProcessorFactory
   implements SolrCoreAware, UpdateRequestProcessorFactory.RunAlways {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c740e696/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java b/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java
index 41ff4af..3c105c4 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java
@@ -69,16 +69,6 @@ import org.slf4j.LoggerFactory;
  * and assumes that the state of the cluster is healthy.
  * </p>
  *
- *
- * nocommit: what about shard splitting and "sub shard leaders" ? ...
- * (no idea if/how that affects things, but i notice lots of logic in DistributedUpdateProcessor along 
- * the lines of "if (isLeader || isSubShardLeader)" and "if (!isLeader) { if (subShardLeader) {..." 
- * which makes me worry that we may need explict testing of "tolerant" behavior when updates are routed 
- * to subshards and then fail?
- *
- * nocommit: once these tests are passing reliably, we should also have a fully randomized sibling test...
- * - randomized # nodes, shards, replicas
- * - random updates contain rand # of docs with rand # failures to a random client
  */
 public class TestTolerantUpdateProcessorCloud extends SolrCloudTestCase {
 


[2/5] lucene-solr:jira/SOLR-445: SOLR-445: fix exception msg when CloudSolrClient does async updates that (cumulatively) exceed maxErrors

Posted by ho...@apache.org.
SOLR-445: fix exception msg when CloudSolrClient does async updates that (cumulatively) exceed maxErrors

I initially thought it would make sense to refactor DistributedUpdatesAsyncException into solr-common and re-use it here, but when I started down that path I realized it didn't make any sense since there aren't actual exceptions to wrap client side.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/5d93384e
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/5d93384e
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/5d93384e

Branch: refs/heads/jira/SOLR-445
Commit: 5d93384e724b6f611270e212a4f9bd5b00c38e85
Parents: fe54da0
Author: Chris Hostetter <ho...@apache.org>
Authored: Mon Mar 21 14:36:12 2016 -0700
Committer: Chris Hostetter <ho...@apache.org>
Committed: Mon Mar 21 14:36:12 2016 -0700

----------------------------------------------------------------------
 .../solr/client/solrj/impl/CloudSolrClient.java       | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5d93384e/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java
index 37cee8e..edfe1c3 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java
@@ -781,14 +781,22 @@ public class CloudSolrClient extends SolrClient {
       if (maxToleratedErrors < toleratedErrors.size()) {
         // cumulative errors are too high, we need to throw a client exception w/correct metadata
 
-        // nocommit: refactor & reuse DistributedUpdatesAsyncException
+        // NOTE: it shouldn't be possible for 1 == toleratedErrors.size(), because if that were the case
+        // then at least one shard should have thrown a real error before this, so we don't worry
+        // about having a more "singular" exception msg for that situation
+        StringBuilder msgBuf =  new StringBuilder()
+          .append(toleratedErrors.size()).append(" Async failures during distributed update: ");
+          
         NamedList metadata = new NamedList<String>();
-        SolrException toThrow = new SolrException(ErrorCode.BAD_REQUEST, "nocommit: better msg from DUAE");
-        toThrow.setMetadata(metadata);
         for (SimpleOrderedMap<String> err : toleratedErrors) {
           ToleratedUpdateError te = ToleratedUpdateError.parseMap(err);
           metadata.add(te.getMetadataKey(), te.getMetadataValue());
+          
+          msgBuf.append("\n").append(te.getMessage());
         }
+        
+        SolrException toThrow = new SolrException(ErrorCode.BAD_REQUEST, msgBuf.toString());
+        toThrow.setMetadata(metadata);
         throw toThrow;
       }
     }


[4/5] lucene-solr:jira/SOLR-445: SOLR-8862 work around. Maybe something like this should be promoted into MiniSolrCloudCluster's start() method? or SolrCloudTestCase's configureCluster?

Posted by ho...@apache.org.
SOLR-8862 work around.  Maybe something like this should be promoted into MiniSolrCloudCluster's start() method? or SolrCloudTestCase's configureCluster?


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/b6be74f2
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/b6be74f2
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/b6be74f2

Branch: refs/heads/jira/SOLR-445
Commit: b6be74f2182c46a10f861556ea81d3ed1a79a308
Parents: c740e69
Author: Chris Hostetter <ho...@apache.org>
Authored: Mon Mar 21 18:34:12 2016 -0700
Committer: Chris Hostetter <ho...@apache.org>
Committed: Mon Mar 21 18:34:12 2016 -0700

----------------------------------------------------------------------
 .../cloud/TestTolerantUpdateProcessorCloud.java | 35 ++++++++++++++++++++
 .../TestTolerantUpdateProcessorRandomCloud.java |  4 +--
 2 files changed, 37 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b6be74f2/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java b/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java
index 3c105c4..48c81de 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java
@@ -113,6 +113,7 @@ public class TestTolerantUpdateProcessorCloud extends SolrCloudTestCase {
     configureCluster(NUM_SERVERS)
       .addConfig(configName, configDir.toPath())
       .configure();
+    assertSpinLoopAllJettyAreRunning(cluster);
     
     Map<String, String> collectionProperties = new HashMap<>();
     collectionProperties.put("config", "solrconfig-distrib-update-processor-chains.xml");
@@ -764,6 +765,40 @@ public class TestTolerantUpdateProcessorCloud extends SolrCloudTestCase {
     
   }
 
+  /**
+   * HACK: Loops over every Jetty instance in the specified MiniSolrCloudCluster to see if they are running,
+   * and sleeps small increments until they all report that they are, or a max num iters is reached
+   * 
+   * (work around for SOLR-8862.  Maybe something like this should be promoted into MiniSolrCloudCluster's 
+   * start() method? or SolrCloudTestCase's configureCluster?)
+   */
+  public static void assertSpinLoopAllJettyAreRunning(MiniSolrCloudCluster cluster) throws InterruptedException {
+    // NOTE: ideally we could use an ExecutorService that tried to open Sockets (with a long timeout)
+    // to each of the jetty instances in parallel w/o any sleeping -- but since they pick their ports
+    // dynamically and don't report them until/unless the server is up, that won't necessarily do us
+    // any good.
+    final int numServers = cluster.getJettySolrRunners().size();
+    int numRunning = 0;
+    for (int i = 5; 0 <= i; i--) {
+      numRunning = 0;
+      for (JettySolrRunner jetty : cluster.getJettySolrRunners()) {
+        if (jetty.isRunning()) {
+          numRunning++;
+        }
+      }
+      if (numServers == numRunning) {
+        return;
+      } else if (0 == i) {
+        // give up
+        break;
+      }
+      // the more nodes we're waiting on, the longer we should try to sleep (within reason)
+      Thread.sleep(Math.min((numServers - numRunning) * 100, 1000));
+    }
+    assertEquals("giving up waiting for all jetty instances to be running",
+                 numServers, numRunning);
+  }
+  
   /** Asserts that the UpdateResponse contains the specified expectedErrs and no others */
   public static void assertUpdateTolerantErrors(String assertionMsgPrefix,
                                                 UpdateResponse response,

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b6be74f2/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorRandomCloud.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorRandomCloud.java b/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorRandomCloud.java
index 536bb89..6e5daeb 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorRandomCloud.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorRandomCloud.java
@@ -104,8 +104,8 @@ public class TestTolerantUpdateProcessorRandomCloud extends SolrCloudTestCase {
     configureCluster(numServers)
       .addConfig(configName, configDir.toPath())
       .configure();
-    
-    Thread.sleep(2000); // anoying attempt to work arround SOLR-8862 // nocommit ? ? ? 
+
+    TestTolerantUpdateProcessorCloud.assertSpinLoopAllJettyAreRunning(cluster);
     
     Map<String, String> collectionProperties = new HashMap<>();
     collectionProperties.put("config", "solrconfig-distrib-update-processor-chains.xml");