Posted to commits@lucene.apache.org by ab...@apache.org on 2018/08/14 18:41:56 UTC

[1/2] lucene-solr:master: SOLR-12470: Search Rate Trigger multiple bug fixes, improvements and documentation updates.

Repository: lucene-solr
Updated Branches:
  refs/heads/master 7ecf9b63b -> 8dd704ef7


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestTriggerIntegration.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestTriggerIntegration.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestTriggerIntegration.java
index 0a28a44..1340bac 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestTriggerIntegration.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestTriggerIntegration.java
@@ -28,6 +28,7 @@ import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.locks.ReentrantLock;
 
@@ -56,6 +57,7 @@ import org.apache.solr.cloud.autoscaling.CapturedEvent;
 import org.apache.solr.cloud.autoscaling.TriggerValidationException;
 import org.apache.solr.common.cloud.LiveNodesListener;
 import org.apache.solr.common.cloud.ZkStateReader;
+import org.apache.solr.common.params.CollectionAdminParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.TimeSource;
 import org.apache.solr.core.SolrResourceLoader;
@@ -86,6 +88,10 @@ public class TestTriggerIntegration extends SimSolrCloudTestCase {
   private static CountDownLatch actionStarted;
   private static CountDownLatch actionInterrupted;
   private static CountDownLatch actionCompleted;
+  private static CountDownLatch triggerStartedLatch;
+  private static CountDownLatch triggerFinishedLatch;
+  private static AtomicInteger triggerStartedCount;
+  private static AtomicInteger triggerFinishedCount;
   private static AtomicBoolean triggerFired;
   private static Set<TriggerEvent> events = ConcurrentHashMap.newKeySet();
 
@@ -131,6 +137,10 @@ public class TestTriggerIntegration extends SimSolrCloudTestCase {
     actionStarted = new CountDownLatch(1);
     actionInterrupted = new CountDownLatch(1);
     actionCompleted = new CountDownLatch(1);
+    triggerStartedLatch = new CountDownLatch(1);
+    triggerFinishedLatch = new CountDownLatch(1);
+    triggerStartedCount = new AtomicInteger();
+    triggerFinishedCount = new AtomicInteger();
     events.clear();
     listenerEvents.clear();
     while (cluster.getClusterStateProvider().getLiveNodes().size() < 2) {
@@ -138,6 +148,12 @@ public class TestTriggerIntegration extends SimSolrCloudTestCase {
       // lets start a node
       cluster.simAddNode();
     }
+    // do this in advance if missing
+    if (!cluster.getSimClusterStateProvider().simListCollections().contains(CollectionAdminParams.SYSTEM_COLL)) {
+      cluster.getSimClusterStateProvider().createSystemCollection();
+      CloudTestUtils.waitForState(cluster, CollectionAdminParams.SYSTEM_COLL, 120, TimeUnit.SECONDS,
+          CloudTestUtils.clusterShape(1, 1, false, true));
+    }
   }
 
   @Test
@@ -1152,15 +1168,33 @@ public class TestTriggerIntegration extends SimSolrCloudTestCase {
     }
   }
 
+  public static class FinishTriggerAction extends TriggerActionBase {
+    @Override
+    public void process(TriggerEvent event, ActionContext context) throws Exception {
+      triggerFinishedCount.incrementAndGet();
+      triggerFinishedLatch.countDown();
+    }
+  }
+
+  public static class StartTriggerAction extends TriggerActionBase {
+    @Override
+    public void process(TriggerEvent event, ActionContext context) throws Exception {
+      triggerStartedLatch.countDown();
+      triggerStartedCount.incrementAndGet();
+    }
+  }
+
+
+
   @Test
-  @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028")
+  //@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028")
   public void testSearchRate() throws Exception {
     SolrClient solrClient = cluster.simGetSolrClient();
     String COLL1 = "collection1";
     CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(COLL1,
         "conf", 1, 2);
     create.process(solrClient);
-    CloudTestUtils.waitForState(cluster, COLL1, 10, TimeUnit.SECONDS, CloudTestUtils.clusterShape(1, 2));
+    CloudTestUtils.waitForState(cluster, COLL1, 10, TimeUnit.SECONDS, CloudTestUtils.clusterShape(1, 2, false, true));
 
     String setTriggerCommand = "{" +
         "'set-trigger' : {" +
@@ -1169,10 +1203,13 @@ public class TestTriggerIntegration extends SimSolrCloudTestCase {
         "'waitFor' : '" + waitForSeconds + "s'," +
         "'enabled' : true," +
         "'aboveRate' : 1.0," +
+        "'aboveNodeRate' : 1.0," +
         "'actions' : [" +
-        "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}" +
-        "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" +
+        "{'name':'start','class':'" + StartTriggerAction.class.getName() + "'}," +
+        "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," +
+        "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," +
         "{'name':'test','class':'" + TestSearchRateAction.class.getName() + "'}" +
+        "{'name':'finish','class':'" + FinishTriggerAction.class.getName() + "'}," +
         "]" +
         "}}";
     SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.POST, setTriggerCommand);
@@ -1200,8 +1237,10 @@ public class TestTriggerIntegration extends SimSolrCloudTestCase {
 
     cluster.getSimClusterStateProvider().simSetCollectionValue(COLL1, "QUERY./select.requestTimes:1minRate", 500, false, true);
 
-    boolean await = triggerFiredLatch.await(20000 / SPEED, TimeUnit.MILLISECONDS);
-    assertTrue("The trigger did not fire at all", await);
+    boolean await = triggerStartedLatch.await(20000 / SPEED, TimeUnit.MILLISECONDS);
+    assertTrue("The trigger did not start in time", await);
+    await = triggerFinishedLatch.await(60000 / SPEED, TimeUnit.MILLISECONDS);
+    assertTrue("The trigger did not finish in time", await);
     // wait for listener to capture the SUCCEEDED stage
     cluster.getTimeSource().sleep(5000);
     List<CapturedEvent> events = listenerEvents.get("srt");

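The new `StartTriggerAction` / `FinishTriggerAction` pair above lets the test wait for the whole action chain to start and to finish, instead of only waiting for the event to fire. A rough, self-contained sketch of that latch-plus-counter pattern (plain `java.util.concurrent`; the extra thread below merely stands in for the asynchronous action chain and is not Solr code):

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

public class TriggerLatchSketch {
  // shared between the "test" thread and the simulated action chain
  static final CountDownLatch startedLatch = new CountDownLatch(1);
  static final CountDownLatch finishedLatch = new CountDownLatch(1);
  static final AtomicInteger startedCount = new AtomicInteger();
  static final AtomicInteger finishedCount = new AtomicInteger();

  public static void main(String[] args) throws InterruptedException {
    Thread actionChain = new Thread(() -> {
      startedCount.incrementAndGet();
      startedLatch.countDown();        // corresponds to the "start" action
      // ... compute / execute / test actions would run here ...
      finishedCount.incrementAndGet();
      finishedLatch.countDown();       // corresponds to the "finish" action
    });
    actionChain.start();

    // wait with timeouts instead of sleeping a fixed amount of time
    if (!startedLatch.await(5, TimeUnit.SECONDS)) {
      throw new AssertionError("The trigger did not start in time");
    }
    if (!finishedLatch.await(10, TimeUnit.SECONDS)) {
      throw new AssertionError("The trigger did not finish in time");
    }
    System.out.println("started=" + startedCount + ", finished=" + finishedCount);
  }
}
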
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/solr-ref-guide/src/solrcloud-autoscaling-triggers.adoc
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/src/solrcloud-autoscaling-triggers.adoc b/solr/solr-ref-guide/src/solrcloud-autoscaling-triggers.adoc
index 0e77c11..3f6135c 100644
--- a/solr/solr-ref-guide/src/solrcloud-autoscaling-triggers.adoc
+++ b/solr/solr-ref-guide/src/solrcloud-autoscaling-triggers.adoc
@@ -194,14 +194,31 @@ Similarly, if the search rate falls below a threshold then the trigger may reque
 replicas are deleted from "cold" shards. It can also optionally issue node-level action requests
 when a cumulative node-level rate falls below a threshold.
 
+Per-shard rates are calculated as the arithmetic average of the rates of all searchable replicas in a given shard.
+This method was chosen to avoid generating false events when a simple client keeps sending requests
+to a single specific replica (adding or removing other replicas cannot fix this situation;
+only proper load balancing can, either by using `CloudSolrClient` or another load-balancing client).
+
 Note: this trigger calculates node-level cumulative rates using per-replica rates reported by
-replicas that are part of monitored collections / shards. This means that it may report
+replicas that are part of monitored collections / shards on each node. This means that it may report
 some nodes as "cold" (underutilized) because it ignores other, perhaps more active, replicas
 belonging to other collections. Also, nodes that don't host any of the monitored replicas or
 those that are explicitly excluded by `node` config property won't be reported at all.
 
-
-This trigger supports the following configuration:
+Note 2: special care should be taken when configuring the `waitFor` property. By default the trigger
+monitors the 1-min average search rate of a replica. Changes to the number of replicas, which should in turn
+change per-replica search rates, may be requested and executed relatively quickly if
+`waitFor` is set to a comparable value of 1 min or shorter. However, the metric value, being a
+moving average, will always lag behind the new "momentary" rate after the changes. This means that
+the monitored metric may not change enough to prevent the
+trigger from firing again (because it will continue to report the average rate as still violating
+the threshold for some time after the change was executed). As a result the trigger may keep
+requesting that even more replicas be added (or removed) and thus "overshoot" the optimal number of replicas.
+For this reason it's recommended to always set `waitFor` to a value several
+times longer than the time constant of the metric being used. For example, with the default 1-min average,
+`waitFor` should be set to at least `2m`.
+
+This trigger supports the following configuration properties:
 
 `collections`:: (string, optional) comma-separated list of collection names to monitor, or any collection if empty / not set.
 
@@ -211,7 +228,9 @@ This trigger supports the following configuration:
 
 `metric`:: (string, optional) metric name that represents the search rate
 (default is `QUERY./select.requestTimes:1minRate`). This name has to identify a single numeric
-metric value, and it may use the colon syntax for selecting one property of a complex metric.
+metric value, and it may use the colon syntax for selecting one property of a complex metric. This value
+is collected from all replicas in a shard, and then an arithmetic average is calculated per shard
+to determine shard-level violations.
 
 `maxOps`:: (integer, optional) maximum number of add replica / delete replica operations
 requested in a single autoscaling event. The default value is 3 and it helps to smooth out
@@ -229,23 +248,30 @@ the value is set to 1. Note also that shard leaders are never deleted.
 `belowRate`:: (float) the lower bound for the request rate metric value. At least one of
 `aboveRate` or `belowRate` must be set.
 
-`aboveOp`:: (string, optional) collection action to request when the upper threshold for a shard or replica is
+`aboveNodeRate`:: (float) the upper bound for the total request rate metric value per node. If not
+set then cumulative per-node rates will be ignored.
+
+`belowNodeRate`:: (float) the lower bound for the total request rate metric value per node. If not
+set then cumulative per-node rates will be ignored.
+
+`aboveOp`:: (string, optional) collection action to request when the upper threshold for a shard is
 exceeded. Default action is `ADDREPLICA` and the trigger will request from 1 up to `maxOps` operations
 per shard per event, proportionally to how much the rate is exceeded. This property can be set to 'NONE'
 to effectively disable the action but still report it to the listeners.
 
-`aboveNodeOp`:: (string, optional) collection action to request when the upper threshold for a node is exceeded.
+`aboveNodeOp`:: (string, optional) collection action to request when the upper threshold for a node (`aboveNodeRate`) is exceeded.
 Default action is `MOVEREPLICA`, and the trigger will request 1 replica operation per hot node per event.
-If both `aboveOp` and `aboveNodeOp` operations are requested then `aboveNodeOp` operations are
-always requested first. This property can be set to 'NONE' to effectively disable the action but still
-report it to the listeners.
+If both node-level and shard-level thresholds are exceeded, `aboveNodeOp` operations are requested
+only when no `aboveOp` (shard-level) operations are requested, because shard-level operations
+will change node-level rates anyway. This property can be set to 'NONE' to effectively disable
+the action but still report it to the listeners.
 
-`belowOp`:: (string, optional) collection action to request when the lower threshold for a shard or replica is
+`belowOp`:: (string, optional) collection action to request when the lower threshold for a shard is
 exceeded. Default action is `DELETEREPLICA`, and the trigger will request at most `maxOps` replicas
 to be deleted from eligible cold shards. This property can be set to 'NONE'
 to effectively disable the action but still report it to the listeners.
 
-`belowNodeOp`:: action to request when the lower threshold for a node is exceeded.
+`belowNodeOp`:: action to request when the lower threshold for a node (`belowNodeRate`) is exceeded.
 Default action is null (not set) and the condition is ignored, because in many cases the
 trigger will monitor only some selected resources (replicas from selected
 collections / shards) so setting this by default to e.g., `DELETENODE` could interfere with
@@ -269,6 +295,7 @@ request node deletion.
   "metric" : "QUERY./select.requestTimes:5minRate",
   "aboveRate" : 100.0,
   "belowRate" : 0.01,
+  "belowNodeRate" : 0.01,
   "belowNodeOp" : "DELETENODE",
   "minReplicas" : 1,
   "waitFor" : "20m",

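To make the documented per-shard averaging concrete, here is a small standalone sketch (made-up class name, thresholds and rate values; not the trigger's actual implementation) of how averaging per-replica rates interacts with the `aboveRate` / `belowRate` bounds described above:

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ShardRateSketch {
  public static void main(String[] args) {
    double aboveRate = 100.0;   // example thresholds, roughly as in the ref-guide snippet
    double belowRate = 0.01;

    // per-replica 1minRate values for each shard (made-up numbers)
    Map<String, List<Double>> replicaRatesPerShard = new HashMap<>();
    replicaRatesPerShard.put("shard1", Arrays.asList(150.0, 90.0));   // uneven request routing
    replicaRatesPerShard.put("shard2", Arrays.asList(0.005, 0.002));

    replicaRatesPerShard.forEach((shard, rates) -> {
      double total = rates.stream().mapToDouble(Double::doubleValue).sum();
      // the shard-level value is the arithmetic average over searchable replicas,
      // so one overloaded replica alone does not make the whole shard "hot"
      double shardRate = total / rates.size();
      if (shardRate > aboveRate) {
        System.out.println(shard + " is hot (avg rate " + shardRate + ") -> e.g. ADDREPLICA");
      } else if (shardRate < belowRate) {
        System.out.println(shard + " is cold (avg rate " + shardRate + ") -> e.g. DELETEREPLICA");
      } else {
        System.out.println(shard + " is within bounds (avg rate " + shardRate + ")");
      }
    });
  }
}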

[2/2] lucene-solr:master: SOLR-12470: Search Rate Trigger multiple bug fixes, improvements and documentation updates.

Posted by ab...@apache.org.
SOLR-12470: Search Rate Trigger multiple bug fixes, improvements and documentation updates.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/8dd704ef
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/8dd704ef
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/8dd704ef

Branch: refs/heads/master
Commit: 8dd704ef78e5a2c3bf7c3d206d44a2d971c758fe
Parents: 7ecf9b6
Author: Andrzej Bialecki <ab...@apache.org>
Authored: Tue Aug 14 20:41:16 2018 +0200
Committer: Andrzej Bialecki <ab...@apache.org>
Committed: Tue Aug 14 20:41:42 2018 +0200

----------------------------------------------------------------------
 solr/CHANGES.txt                                |   2 +
 .../org/apache/solr/cloud/ActionThrottle.java   |   2 +-
 .../cloud/autoscaling/IndexSizeTrigger.java     |   3 +
 .../cloud/autoscaling/ScheduledTriggers.java    |   7 +-
 .../cloud/autoscaling/SearchRateTrigger.java    | 173 ++++++++++++++-----
 .../autoscaling/TriggerActionException.java     |  33 ++++
 .../apache/solr/cloud/ActionThrottleTest.java   |   2 +-
 .../org/apache/solr/cloud/CloudTestUtils.java   |  29 +++-
 .../cloud/autoscaling/IndexSizeTriggerTest.java |  25 +--
 .../ScheduledMaintenanceTriggerTest.java        |   2 +-
 .../SearchRateTriggerIntegrationTest.java       |  41 +++--
 .../autoscaling/SearchRateTriggerTest.java      |  11 +-
 .../sim/SimClusterStateProvider.java            | 158 +++++++++++------
 .../autoscaling/sim/SimNodeStateProvider.java   |  14 +-
 .../autoscaling/sim/TestComputePlanAction.java  |   4 +-
 .../autoscaling/sim/TestExecutePlanAction.java  |  10 +-
 .../cloud/autoscaling/sim/TestLargeCluster.java |  89 +++++++---
 .../cloud/autoscaling/sim/TestPolicyCloud.java  |  10 +-
 .../autoscaling/sim/TestTriggerIntegration.java |  51 +++++-
 .../src/solrcloud-autoscaling-triggers.adoc     |  49 ++++--
 20 files changed, 519 insertions(+), 196 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 08499e6..d165e21 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -233,6 +233,8 @@ Bug Fixes
 
 * SOLR-12541: Metrics handler throws an error if there are transient cores. (ab)
 
+* SOLR-12470: Search Rate Trigger multiple bug fixes, improvements and documentation updates. (ab)
+
 Optimizations
 ----------------------
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/java/org/apache/solr/cloud/ActionThrottle.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/ActionThrottle.java b/solr/core/src/java/org/apache/solr/cloud/ActionThrottle.java
index f260c8c..1724b53 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ActionThrottle.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ActionThrottle.java
@@ -82,7 +82,7 @@ public class ActionThrottle {
     if (sleep > 0) {
       log.info("Throttling {} attempts - waiting for {}ms", name, sleep);
       try {
-        Thread.sleep(sleep);
+        timeSource.sleep(sleep);
       } catch (InterruptedException e) {
         Thread.currentThread().interrupt();
       }

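The `ActionThrottle` fix above routes the wait through the injected `timeSource` instead of calling `Thread.sleep()` directly, so a simulated clock can drive the throttle in tests. A simplified stand-in (a minimal clock interface of my own, not Solr's actual `TimeSource` API) illustrating why that indirection matters:

public class TimeSourceSketch {

  /** Minimal clock abstraction; Solr's real TimeSource has a richer API. */
  interface Clock {
    long getTimeNs();
    void sleep(long ms) throws InterruptedException;
  }

  /** Production clock: real wall-clock waiting. */
  static final Clock NANO_TIME = new Clock() {
    public long getTimeNs() { return System.nanoTime(); }
    public void sleep(long ms) throws InterruptedException { Thread.sleep(ms); }
  };

  /** Simulated clock: time advances instantly, so throttled tests stay fast. */
  static class SimClock implements Clock {
    private long nowNs = 0;
    public long getTimeNs() { return nowNs; }
    public void sleep(long ms) { nowNs += ms * 1_000_000L; } // just advance the counter
  }

  /** A throttle-like wait that uses the injected clock rather than Thread.sleep(). */
  static void throttle(Clock clock, long minGapMs, long lastActionNs) throws InterruptedException {
    long elapsedMs = (clock.getTimeNs() - lastActionNs) / 1_000_000L;
    long sleepMs = minGapMs - elapsedMs;
    if (sleepMs > 0) {
      clock.sleep(sleepMs);
    }
  }

  public static void main(String[] args) throws InterruptedException {
    // NANO_TIME would be used in production; a test injects SimClock instead
    SimClock sim = new SimClock();
    long last = sim.getTimeNs();
    throttle(sim, 5000, last);          // "waits" 5s of simulated time, returns immediately
    System.out.println("simulated elapsed ms: " + (sim.getTimeNs() - last) / 1_000_000L);
  }
}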
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/java/org/apache/solr/cloud/autoscaling/IndexSizeTrigger.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/IndexSizeTrigger.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/IndexSizeTrigger.java
index f729864..3f2ea8a 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/IndexSizeTrigger.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/IndexSizeTrigger.java
@@ -407,6 +407,9 @@ public class IndexSizeTrigger extends TriggerBase {
         replicas.forEach(r -> lastAboveEventMap.put(r.getCore(), now));
       });
       belowSize.forEach((coll, replicas) -> {
+        if (replicas.size() < 2) {
+          return;
+        }
         lastBelowEventMap.put(replicas.get(0).getCore(), now);
         lastBelowEventMap.put(replicas.get(1).getCore(), now);
       });

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java
index c125209..7c3cbb0 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java
@@ -308,7 +308,7 @@ public class ScheduledTriggers implements Closeable {
                   action.process(event, actionContext);
                 } catch (Exception e) {
                   triggerListeners1.fireListeners(event.getSource(), event, TriggerEventProcessorStage.FAILED, action.getName(), actionContext, e, null);
-                  throw new Exception("Error executing action: " + action.getName() + " for trigger event: " + event, e);
+                  throw new TriggerActionException(event.getSource(), action.getName(), "Error processing action for trigger event: " + event, e);
                 }
                 List<String> afterActions = (List<String>) actionContext.getProperties().computeIfAbsent(TriggerEventProcessorStage.AFTER_ACTION.toString(), k -> new ArrayList<String>());
                 afterActions.add(action.getName());
@@ -319,8 +319,11 @@ public class ScheduledTriggers implements Closeable {
                 assert ev.getId().equals(event.getId());
               }
               triggerListeners1.fireListeners(event.getSource(), event, TriggerEventProcessorStage.SUCCEEDED);
-            } catch (Exception e) {
+            } catch (TriggerActionException e) {
               log.warn("Exception executing actions", e);
+            } catch (Exception e) {
+              triggerListeners1.fireListeners(event.getSource(), event, TriggerEventProcessorStage.FAILED);
+              log.warn("Unhandled exception executing actions", e);
             } finally {
               cooldownStart.set(cloudManager.getTimeSource().getTimeNs());
               hasPendingActions.set(false);

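The `ScheduledTriggers` change distinguishes an action failure that was already reported to listeners (now wrapped in `TriggerActionException`) from an unexpected failure, which additionally fires a FAILED stage. A stripped-down sketch of that two-tier handling (hypothetical `fireListeners`, stage names and action list; not the real `ScheduledTriggers` code):

import java.util.Arrays;
import java.util.List;

public class ActionErrorHandlingSketch {

  /** Mirrors the idea of TriggerActionException: a failure already reported to listeners. */
  static class ActionException extends Exception {
    final String actionName;
    ActionException(String actionName, String message, Throwable cause) {
      super(message, cause);
      this.actionName = actionName;
    }
  }

  static void fireListeners(String stage, String detail) {
    System.out.println("listener stage=" + stage + " detail=" + detail);
  }

  static void runAction(String name) throws Exception {
    if (name.equals("execute")) {
      throw new IllegalStateException("simulated failure in " + name);
    }
  }

  public static void main(String[] args) {
    List<String> actions = Arrays.asList("start", "compute", "execute", "finish");
    try {
      for (String action : actions) {
        try {
          runAction(action);
        } catch (Exception e) {
          // report FAILED here, then rethrow as the dedicated exception type
          fireListeners("FAILED", action);
          throw new ActionException(action, "Error processing action: " + action, e);
        }
      }
      fireListeners("SUCCEEDED", "all actions");
    } catch (ActionException e) {
      // already reported above - just log
      System.out.println("action failed: " + e.actionName);
    } catch (Exception e) {
      // unexpected failure - make sure listeners still see a FAILED stage
      fireListeners("FAILED", "unexpected: " + e);
    }
  }
}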
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java
index 1824f7f..81d56d3 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java
@@ -63,6 +63,8 @@ public class SearchRateTrigger extends TriggerBase {
   public static final String MIN_REPLICAS_PROP = "minReplicas";
   public static final String ABOVE_RATE_PROP = "aboveRate";
   public static final String BELOW_RATE_PROP = "belowRate";
+  public static final String ABOVE_NODE_RATE_PROP = "aboveNodeRate";
+  public static final String BELOW_NODE_RATE_PROP = "belowNodeRate";
   public static final String ABOVE_OP_PROP = "aboveOp";
   public static final String BELOW_OP_PROP = "belowOp";
   public static final String ABOVE_NODE_OP_PROP = "aboveNodeOp";
@@ -81,6 +83,7 @@ public class SearchRateTrigger extends TriggerBase {
   public static final String COLD_COLLECTIONS = "coldCollections";
   public static final String COLD_SHARDS = "coldShards";
   public static final String COLD_REPLICAS = "coldReplicas";
+  public static final String VIOLATION_PROP = "violationType";
 
   public static final int DEFAULT_MAX_OPS = 3;
   public static final String DEFAULT_METRIC = "QUERY./select.requestTimes:1minRate";
@@ -93,6 +96,8 @@ public class SearchRateTrigger extends TriggerBase {
   private String node;
   private double aboveRate;
   private double belowRate;
+  private double aboveNodeRate;
+  private double belowNodeRate;
   private CollectionParams.CollectionAction aboveOp, belowOp, aboveNodeOp, belowNodeOp;
   private final Map<String, Long> lastCollectionEvent = new ConcurrentHashMap<>();
   private final Map<String, Long> lastNodeEvent = new ConcurrentHashMap<>();
@@ -117,6 +122,8 @@ public class SearchRateTrigger extends TriggerBase {
         BELOW_NODE_OP_PROP,
         ABOVE_RATE_PROP,
         BELOW_RATE_PROP,
+        ABOVE_NODE_RATE_PROP,
+        BELOW_NODE_RATE_PROP,
         // back-compat props
         BC_COLLECTION_PROP,
         BC_RATE_PROP);
@@ -192,6 +199,28 @@ public class SearchRateTrigger extends TriggerBase {
       belowRate = -1;
     }
 
+    // node rates
+    above = properties.get(ABOVE_NODE_RATE_PROP);
+    below = properties.get(BELOW_NODE_RATE_PROP);
+    if (above != null) {
+      try {
+        aboveNodeRate = Double.parseDouble(String.valueOf(above));
+      } catch (Exception e) {
+        throw new TriggerValidationException(name, ABOVE_NODE_RATE_PROP, "Invalid configuration value: '" + above + "': " + e.toString());
+      }
+    } else {
+      aboveNodeRate = Double.MAX_VALUE;
+    }
+    if (below != null) {
+      try {
+        belowNodeRate = Double.parseDouble(String.valueOf(below));
+      } catch (Exception e) {
+        throw new TriggerValidationException(name, BELOW_NODE_RATE_PROP, "Invalid configuration value: '" + below + "': " + e.toString());
+      }
+    } else {
+      belowNodeRate = -1;
+    }
+
     String aboveOpStr = String.valueOf(properties.getOrDefault(ABOVE_OP_PROP, CollectionParams.CollectionAction.ADDREPLICA.toLower()));
     String belowOpStr = String.valueOf(properties.getOrDefault(BELOW_OP_PROP, CollectionParams.CollectionAction.DELETEREPLICA.toLower()));
     aboveOp = CollectionParams.CollectionAction.get(aboveOpStr);
@@ -231,6 +260,8 @@ public class SearchRateTrigger extends TriggerBase {
     config.put(MIN_REPLICAS_PROP, minReplicas);
     config.put(ABOVE_RATE_PROP, aboveRate);
     config.put(BELOW_RATE_PROP, belowRate);
+    config.put(ABOVE_NODE_RATE_PROP, aboveNodeRate);
+    config.put(BELOW_NODE_RATE_PROP, belowNodeRate);
     config.put(ABOVE_OP_PROP, aboveOp);
     config.put(ABOVE_NODE_OP_PROP, aboveNodeOp);
     config.put(BELOW_OP_PROP, belowOp);
@@ -339,6 +370,10 @@ public class SearchRateTrigger extends TriggerBase {
         continue;
       }
       Map<String, Object> rates = cloudManager.getNodeStateProvider().getNodeValues(node, metricTags.keySet());
+      if (log.isDebugEnabled()) {
+        log.debug("### rates for node " + node);
+        rates.forEach((tag, rate) -> log.debug("###  " + tag + "\t" + rate));
+      }
       rates.forEach((tag, rate) -> {
         ReplicaInfo info = metricTags.get(tag);
         if (info == null) {
@@ -355,18 +390,28 @@ public class SearchRateTrigger extends TriggerBase {
       });
     }
 
+    if (log.isDebugEnabled()) {
+      collectionRates.forEach((coll, collRates) -> {
+        log.debug("## Collection: {}", coll);
+        collRates.forEach((s, replicas) -> {
+          log.debug("##  - {}", s);
+          replicas.forEach(ri -> log.debug("##     {}  {}", ri.getCore(), ri.getVariable(AutoScalingParams.RATE)));
+        });
+      });
+    }
     long now = cloudManager.getTimeSource().getTimeNs();
     Map<String, Double> hotNodes = new HashMap<>();
     Map<String, Double> coldNodes = new HashMap<>();
+
     // check for exceeded rates and filter out those with less than waitFor from previous events
     nodeRates.entrySet().stream()
         .filter(entry -> node.equals(Policy.ANY) || node.equals(entry.getKey()))
         .forEach(entry -> {
-          if (entry.getValue().get() > aboveRate) {
+          if (entry.getValue().get() > aboveNodeRate) {
             if (waitForElapsed(entry.getKey(), now, lastNodeEvent)) {
               hotNodes.put(entry.getKey(), entry.getValue().get());
             }
-          } else if (entry.getValue().get() < belowRate) {
+          } else if (entry.getValue().get() < belowNodeRate) {
             if (waitForElapsed(entry.getKey(), now, lastNodeEvent)) {
               coldNodes.put(entry.getKey(), entry.getValue().get());
             }
@@ -383,7 +428,7 @@ public class SearchRateTrigger extends TriggerBase {
     List<ReplicaInfo> coldReplicas = new ArrayList<>();
     collectionRates.forEach((coll, shardRates) -> {
       shardRates.forEach((sh, replicaRates) -> {
-        double shardRate = replicaRates.stream()
+        double totalShardRate = replicaRates.stream()
             .map(r -> {
               String elapsedKey = r.getCollection() + "." + r.getCore();
               if ((Double)r.getVariable(AutoScalingParams.RATE) > aboveRate) {
@@ -401,7 +446,10 @@ public class SearchRateTrigger extends TriggerBase {
               return r;
             })
             .mapToDouble(r -> (Double)r.getVariable(AutoScalingParams.RATE)).sum();
+        // calculate average shard rate over all searchable replicas (see SOLR-12470)
+        double shardRate = totalShardRate / searchableReplicationFactors.get(coll).get(sh).doubleValue();
         String elapsedKey = coll + "." + sh;
+        log.debug("-- {}: totalShardRate={}, shardRate={}", elapsedKey, totalShardRate, shardRate);
         if ((collections.isEmpty() || collections.contains(coll)) &&
             (shard.equals(Policy.ANY) || shard.equals(sh))) {
           if (shardRate > aboveRate) {
@@ -411,6 +459,13 @@ public class SearchRateTrigger extends TriggerBase {
           } else if (shardRate < belowRate) {
             if (waitForElapsed(elapsedKey, now, lastShardEvent)) {
               coldShards.computeIfAbsent(coll, s -> new HashMap<>()).put(sh, shardRate);
+              log.debug("-- coldShard waitFor elapsed {}", elapsedKey);
+            } else {
+              if (log.isDebugEnabled()) {
+                Long lastTime = lastShardEvent.computeIfAbsent(elapsedKey, s -> now);
+                long elapsed = TimeUnit.SECONDS.convert(now - lastTime, TimeUnit.NANOSECONDS);
+                log.debug("-- waitFor didn't elapse for {}, waitFor={}, elapsed={}", elapsedKey, getWaitForSecond(), elapsed);
+              }
             }
           } else {
             // no violation - clear waitForElapsed
@@ -511,9 +566,10 @@ public class SearchRateTrigger extends TriggerBase {
     });
 
     final List<TriggerEvent.Op> ops = new ArrayList<>();
+    final Set<String> violations = new HashSet<>();
 
-    calculateHotOps(ops, searchableReplicationFactors, hotNodes, hotCollections, hotShards, hotReplicas);
-    calculateColdOps(ops, clusterState, searchableReplicationFactors, coldNodes, coldCollections, coldShards, coldReplicas);
+    calculateHotOps(ops, violations, searchableReplicationFactors, hotNodes, hotCollections, hotShards, hotReplicas);
+    calculateColdOps(ops, violations, clusterState, searchableReplicationFactors, coldNodes, coldCollections, coldShards, coldReplicas);
 
     if (ops.isEmpty()) {
       return;
@@ -521,7 +577,7 @@ public class SearchRateTrigger extends TriggerBase {
 
     if (processor.process(new SearchRateEvent(getName(), eventTime.get(), ops,
         hotNodes, hotCollections, hotShards, hotReplicas,
-        coldNodes, coldCollections, coldShards, coldReplicas))) {
+        coldNodes, coldCollections, coldShards, coldReplicas, violations))) {
       // update lastEvent times
       hotNodes.keySet().forEach(node -> lastNodeEvent.put(node, now));
       coldNodes.keySet().forEach(node -> lastNodeEvent.put(node, now));
@@ -537,6 +593,7 @@ public class SearchRateTrigger extends TriggerBase {
   }
 
   private void calculateHotOps(List<TriggerEvent.Op> ops,
+                               Set<String> violations,
                                Map<String, Map<String, AtomicInteger>> searchableReplicationFactors,
                                Map<String, Double> hotNodes,
                                Map<String, Double> hotCollections,
@@ -545,40 +602,47 @@ public class SearchRateTrigger extends TriggerBase {
     // calculate the number of replicas to add to each hot shard, based on how much the rate was
     // exceeded - but within limits.
 
-    // first resolve a situation when only a node is hot but no collection / shard / replica is hot
+    // first resolve a situation when only a node is hot but no collection / shard is hot
     // TODO: eventually we may want to commission a new node
-    if (!hotNodes.isEmpty() && hotShards.isEmpty() && hotCollections.isEmpty() && hotReplicas.isEmpty()) {
-      // move replicas around
-      if (aboveNodeOp != null) {
-        hotNodes.forEach((n, r) -> {
-          ops.add(new TriggerEvent.Op(aboveNodeOp, Suggester.Hint.SRC_NODE, n));
-        });
-      }
-    } else {
-      // add replicas
-      Map<String, Map<String, List<Pair<String, String>>>> hints = new HashMap<>();
-
-      hotShards.forEach((coll, shards) -> shards.forEach((s, r) -> {
-        List<Pair<String, String>> perShard = hints
-            .computeIfAbsent(coll, c -> new HashMap<>())
-            .computeIfAbsent(s, sh -> new ArrayList<>());
-        addReplicaHints(coll, s, r, searchableReplicationFactors.get(coll).get(s).get(), perShard);
-      }));
-      hotReplicas.forEach(ri -> {
-        double r = (Double)ri.getVariable(AutoScalingParams.RATE);
-        // add only if not already accounted for in hotShards
-        List<Pair<String, String>> perShard = hints
-            .computeIfAbsent(ri.getCollection(), c -> new HashMap<>())
-            .computeIfAbsent(ri.getShard(), sh -> new ArrayList<>());
-        if (perShard.isEmpty()) {
-          addReplicaHints(ri.getCollection(), ri.getShard(), r, searchableReplicationFactors.get(ri.getCollection()).get(ri.getShard()).get(), perShard);
+    if (!hotNodes.isEmpty()) {
+      if (hotShards.isEmpty() && hotCollections.isEmpty()) {
+        // move replicas around
+        if (aboveNodeOp != null) {
+          hotNodes.forEach((n, r) -> {
+            ops.add(new TriggerEvent.Op(aboveNodeOp, Suggester.Hint.SRC_NODE, n));
+            violations.add(HOT_NODES);
+          });
         }
-      });
-
-      hints.values().forEach(m -> m.values().forEach(lst -> lst.forEach(p -> {
-        ops.add(new TriggerEvent.Op(aboveOp, Suggester.Hint.COLL_SHARD, p));
-      })));
+      } else {
+        // ignore - hot shards will result in changes that will change hot node status anyway
+      }
     }
+    // add replicas
+    Map<String, Map<String, List<Pair<String, String>>>> hints = new HashMap<>();
+
+    // HOT COLLECTIONS
+    // currently we don't do anything for hot collections. Theoretically we could add
+    // 1 replica more to each shard, based on how close to the threshold each shard is
+    // but it's probably better to wait for a shard to become hot and be more precise.
+
+    // HOT SHARDS
+
+    hotShards.forEach((coll, shards) -> shards.forEach((s, r) -> {
+      List<Pair<String, String>> perShard = hints
+          .computeIfAbsent(coll, c -> new HashMap<>())
+          .computeIfAbsent(s, sh -> new ArrayList<>());
+      addReplicaHints(coll, s, r, searchableReplicationFactors.get(coll).get(s).get(), perShard);
+      violations.add(HOT_SHARDS);
+    }));
+
+    // HOT REPLICAS
+    // Hot replicas (while their shards are not hot) may be caused by
+    // dumb clients that use direct replica URLs - this is beyond our control
+    // so ignore them.
+
+    hints.values().forEach(m -> m.values().forEach(lst -> lst.forEach(p -> {
+      ops.add(new TriggerEvent.Op(aboveOp, Suggester.Hint.COLL_SHARD, p));
+    })));
 
   }
 
@@ -601,6 +665,7 @@ public class SearchRateTrigger extends TriggerBase {
   }
 
   private void calculateColdOps(List<TriggerEvent.Op> ops,
+                                Set<String> violations,
                                 ClusterState clusterState,
                                 Map<String, Map<String, AtomicInteger>> searchableReplicationFactors,
                                 Map<String, Double> coldNodes,
@@ -611,12 +676,13 @@ public class SearchRateTrigger extends TriggerBase {
     // Probably can't do anything reasonable about whole cold collections
     // because they may be needed even if not used.
 
-    // COLD SHARDS:
-    // Cold shards mean that there are too many replicas per shard - but it also
-    // means that all replicas in these shards are cold too, so we can simply
-    // address this by deleting cold replicas
+    // COLD SHARDS & COLD REPLICAS:
+    // We remove cold replicas only from cold shards, otherwise we are susceptible to uneven
+    // replica routing (which is beyond our control).
+    // If we removed replicas from non-cold shards we could accidentally bring that shard into
+    // the hot range, which would result in adding replica, and that replica could again stay cold due to
+    // the same routing issue, which then would lead to removing that replica, etc, etc...
 
-    // COLD REPLICAS:
     // Remove cold replicas but only when there's at least a minimum number of searchable
     // replicas still available (additional non-searchable replicas may exist, too)
     // NOTE: do this before adding ops for DELETENODE because we don't want to attempt
@@ -627,11 +693,18 @@ public class SearchRateTrigger extends TriggerBase {
           .computeIfAbsent(ri.getShard(), s -> new ArrayList<>())
           .add(ri);
     });
-    byCollectionByShard.forEach((coll, shards) -> {
-      shards.forEach((shard, replicas) -> {
+    coldShards.forEach((coll, perShard) -> {
+      perShard.forEach((shard, rate) -> {
+        List<ReplicaInfo> replicas = byCollectionByShard
+            .getOrDefault(coll, Collections.emptyMap())
+            .getOrDefault(shard, Collections.emptyList());
+        if (replicas.isEmpty()) {
+          return;
+        }
         // only delete if there's at least minRF searchable replicas left
         int rf = searchableReplicationFactors.get(coll).get(shard).get();
-        // we only really need a leader and we may be allowed to remove other replicas
+        // assume first that we only really need a leader and we may be
+        // allowed to remove other replicas
         int minRF = 1;
         // but check the official RF and don't go below that
         Integer RF = clusterState.getCollection(coll).getReplicationFactor();
@@ -660,6 +733,7 @@ public class SearchRateTrigger extends TriggerBase {
                 Suggester.Hint.COLL_SHARD, new Pair<>(ri.getCollection(), ri.getShard()));
             op.addHint(Suggester.Hint.REPLICA, ri.getName());
             ops.add(op);
+            violations.add(COLD_SHARDS);
             limit.decrementAndGet();
           });
         }
@@ -669,7 +743,7 @@ public class SearchRateTrigger extends TriggerBase {
     // COLD NODES:
     // Unlike the case of hot nodes, if a node is cold then any monitored
     // collections / shards / replicas located on that node are cold, too.
-    // HOWEVER, we check only non-pull replicas and only from selected collections / shards,
+    // HOWEVER, we check only replicas from selected collections / shards,
     // so deleting a cold node is dangerous because it may interfere with these
     // non-monitored resources - this is the reason the default belowNodeOp is null / ignored.
     //
@@ -679,6 +753,7 @@ public class SearchRateTrigger extends TriggerBase {
     if (belowNodeOp != null) {
       coldNodes.forEach((node, rate) -> {
         ops.add(new TriggerEvent.Op(belowNodeOp, Suggester.Hint.SRC_NODE, node));
+        violations.add(COLD_NODES);
       });
     }
 
@@ -688,7 +763,7 @@ public class SearchRateTrigger extends TriggerBase {
   private boolean waitForElapsed(String name, long now, Map<String, Long> lastEventMap) {
     Long lastTime = lastEventMap.computeIfAbsent(name, s -> now);
     long elapsed = TimeUnit.SECONDS.convert(now - lastTime, TimeUnit.NANOSECONDS);
-    log.trace("name={}, lastTime={}, elapsed={}", name, lastTime, elapsed);
+    log.trace("name={}, lastTime={}, elapsed={}, waitFor={}", name, lastTime, elapsed, getWaitForSecond());
     if (TimeUnit.SECONDS.convert(now - lastTime, TimeUnit.NANOSECONDS) < getWaitForSecond()) {
       return false;
     }
@@ -704,7 +779,8 @@ public class SearchRateTrigger extends TriggerBase {
                            Map<String, Double> coldNodes,
                            Map<String, Double> coldCollections,
                            Map<String, Map<String, Double>> coldShards,
-                           List<ReplicaInfo> coldReplicas) {
+                           List<ReplicaInfo> coldReplicas,
+                           Set<String> violations) {
       super(TriggerEventType.SEARCHRATE, source, eventTime, null);
       properties.put(TriggerEvent.REQUESTED_OPS, ops);
       properties.put(HOT_NODES, hotNodes);
@@ -715,6 +791,7 @@ public class SearchRateTrigger extends TriggerBase {
       properties.put(COLD_COLLECTIONS, coldCollections);
       properties.put(COLD_SHARDS, coldShards);
       properties.put(COLD_REPLICAS, coldReplicas);
+      properties.put(VIOLATION_PROP, violations);
     }
   }
 }
\ No newline at end of file

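Besides the per-shard averaging, the trigger throttles repeated events per shard/node key via `waitForElapsed`. A condensed sketch of that idea (standalone class and simulated timestamps; not the exact method from the diff):

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;

public class WaitForSketch {
  // last event time (in nanoseconds) per shard/node key, like the trigger's lastShardEvent/lastNodeEvent maps
  private final Map<String, Long> lastEvent = new ConcurrentHashMap<>();
  private final long waitForSeconds;

  WaitForSketch(long waitForSeconds) {
    this.waitForSeconds = waitForSeconds;
  }

  /** Returns true only if at least waitForSeconds have passed since the last event for this key. */
  boolean waitForElapsed(String key, long nowNs) {
    long lastNs = lastEvent.computeIfAbsent(key, k -> nowNs);
    long elapsedSec = TimeUnit.SECONDS.convert(nowNs - lastNs, TimeUnit.NANOSECONDS);
    return elapsedSec >= waitForSeconds;
  }

  public static void main(String[] args) {
    WaitForSketch w = new WaitForSketch(120);   // e.g. waitFor=2m, as recommended in the docs above
    long t0 = 0L;
    System.out.println(w.waitForElapsed("collection1.shard1", t0));                                  // false - first sighting
    System.out.println(w.waitForElapsed("collection1.shard1", t0 + TimeUnit.SECONDS.toNanos(60)));   // false - only 1m elapsed
    System.out.println(w.waitForElapsed("collection1.shard1", t0 + TimeUnit.SECONDS.toNanos(150)));  // true  - 2.5m elapsed
  }
}
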
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerActionException.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerActionException.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerActionException.java
new file mode 100644
index 0000000..624ce68
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerActionException.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.cloud.autoscaling;
+
+/**
+ * Trigger action-specific exception.
+ */
+public class TriggerActionException extends Exception {
+
+  public final String triggerName;
+  public final String actionName;
+
+  public TriggerActionException(String triggerName, String actionName, String message, Throwable cause) {
+    super(message, cause);
+    this.triggerName = triggerName;
+    this.actionName = actionName;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/ActionThrottleTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/ActionThrottleTest.java b/solr/core/src/test/org/apache/solr/cloud/ActionThrottleTest.java
index d277e31..d8fe78b 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ActionThrottleTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ActionThrottleTest.java
@@ -53,7 +53,7 @@ public class ActionThrottleTest extends SolrTestCaseJ4 {
 
     @Override
     public void sleep(long ms) throws InterruptedException {
-      throw new UnsupportedOperationException();
+      TimeSource.NANO_TIME.sleep(ms);
     }
 
     @Override

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/CloudTestUtils.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/CloudTestUtils.java b/solr/core/src/test/org/apache/solr/cloud/CloudTestUtils.java
index 448889f..b67b551 100644
--- a/solr/core/src/test/org/apache/solr/cloud/CloudTestUtils.java
+++ b/solr/core/src/test/org/apache/solr/cloud/CloudTestUtils.java
@@ -20,6 +20,7 @@ package org.apache.solr.cloud;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
@@ -118,7 +119,7 @@ public class CloudTestUtils {
    * @param expectedReplicas expected number of active replicas
    */
   public static CollectionStatePredicate clusterShape(int expectedShards, int expectedReplicas) {
-    return clusterShape(expectedShards, expectedReplicas, false);
+    return clusterShape(expectedShards, expectedReplicas, false, false);
   }
 
   /**
@@ -129,30 +130,46 @@ public class CloudTestUtils {
    * @param expectedShards expected number of shards
    * @param expectedReplicas expected number of active replicas
    * @param withInactive if true then count also inactive shards
+   * @param requireLeaders if true then require that each shard has a leader
    */
-  public static CollectionStatePredicate clusterShape(int expectedShards, int expectedReplicas, boolean withInactive) {
+  public static CollectionStatePredicate clusterShape(int expectedShards, int expectedReplicas, boolean withInactive,
+                                                      boolean requireLeaders) {
     return (liveNodes, collectionState) -> {
       if (collectionState == null) {
-        log.debug("-- null collection");
+        log.trace("-- null collection");
         return false;
       }
       Collection<Slice> slices = withInactive ? collectionState.getSlices() : collectionState.getActiveSlices();
       if (slices.size() != expectedShards) {
-        log.debug("-- wrong number of active slices, expected=" + expectedShards + ", found=" + collectionState.getSlices().size());
+        log.trace("-- wrong number of active slices, expected={}, found={}", expectedShards, collectionState.getSlices().size());
         return false;
       }
+      Set<String> leaderless = new HashSet<>();
       for (Slice slice : slices) {
         int activeReplicas = 0;
+        if (requireLeaders && slice.getLeader() == null) {
+          leaderless.add(slice.getName());
+          continue;
+        }
+        // skip other checks, we're going to fail anyway
+        if (!leaderless.isEmpty()) {
+          continue;
+        }
         for (Replica replica : slice) {
           if (replica.isActive(liveNodes))
             activeReplicas++;
         }
         if (activeReplicas != expectedReplicas) {
-          log.debug("-- wrong number of active replicas in slice " + slice.getName() + ", expected=" + expectedReplicas + ", found=" + activeReplicas);
+          log.trace("-- wrong number of active replicas in slice {}, expected={}, found={}", slice.getName(), expectedReplicas, activeReplicas);
           return false;
         }
       }
-      return true;
+      if (leaderless.isEmpty()) {
+        return true;
+      } else {
+        log.trace("-- shards without leaders: {}", leaderless);
+        return false;
+      }
     };
   }
 }

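The extended `clusterShape` predicate can now also require that every shard has an elected leader. A plain-Java sketch of the same shape check over a toy data model (hypothetical `ShardInfo` class, not Solr's `Slice`/`Replica` API):

import java.util.HashMap;
import java.util.Map;

public class ClusterShapeSketch {

  /** Toy model of one shard: number of active replicas and whether a leader is known. */
  static class ShardInfo {
    final int activeReplicas;
    final boolean hasLeader;
    ShardInfo(int activeReplicas, boolean hasLeader) {
      this.activeReplicas = activeReplicas;
      this.hasLeader = hasLeader;
    }
  }

  static boolean clusterShape(Map<String, ShardInfo> shards,
                              int expectedShards, int expectedReplicas, boolean requireLeaders) {
    if (shards.size() != expectedShards) {
      return false;                              // wrong number of shards
    }
    for (ShardInfo s : shards.values()) {
      if (requireLeaders && !s.hasLeader) {
        return false;                            // a shard without a leader fails the check
      }
      if (s.activeReplicas != expectedReplicas) {
        return false;                            // wrong number of active replicas
      }
    }
    return true;
  }

  public static void main(String[] args) {
    Map<String, ShardInfo> state = new HashMap<>();
    state.put("shard1", new ShardInfo(2, true));
    state.put("shard2", new ShardInfo(2, false));  // replicas active but no leader elected yet
    System.out.println(clusterShape(state, 2, 2, false));  // true  - shape only
    System.out.println(clusterShape(state, 2, 2, true));   // false - shard2 is leaderless
  }
}
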
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/IndexSizeTriggerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/IndexSizeTriggerTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/IndexSizeTriggerTest.java
index daf9867..e591c57 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/IndexSizeTriggerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/IndexSizeTriggerTest.java
@@ -67,7 +67,6 @@ import static org.apache.solr.common.cloud.ZkStateReader.SOLR_AUTOSCALING_CONF_P
  *
  */
 @LogLevel("org.apache.solr.cloud.autoscaling=DEBUG")
-//05-Jul-2018 @LuceneTestCase.BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12392")
 public class IndexSizeTriggerTest extends SolrCloudTestCase {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
@@ -116,6 +115,10 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
     if (cloudManager instanceof SimCloudManager) {
       log.info(((SimCloudManager) cloudManager).dumpClusterState(true));
       ((SimCloudManager) cloudManager).getSimClusterStateProvider().simDeleteAllCollections();
+      ((SimCloudManager) cloudManager).simClearSystemCollection();
+      ((SimCloudManager) cloudManager).getSimClusterStateProvider().simResetLeaderThrottles();
+      ((SimCloudManager) cloudManager).simRestartOverseer(null);
+      cloudManager.getTimeSource().sleep(500);
       ((SimCloudManager) cloudManager).simResetOpCounts();
     } else {
       cluster.deleteAllCollections();
@@ -137,14 +140,14 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
   }
 
   @Test
-  @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 05-Jul-2018
+  //@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 05-Jul-2018
   public void testTrigger() throws Exception {
     String collectionName = "testTrigger_collection";
     CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collectionName,
         "conf", 2, 2).setMaxShardsPerNode(2);
     create.process(solrClient);
     CloudTestUtils.waitForState(cloudManager, "failed to create " + collectionName, collectionName,
-        CloudTestUtils.clusterShape(2, 2));
+        CloudTestUtils.clusterShape(2, 2, false, true));
 
     long waitForSeconds = 3 + random().nextInt(5);
     Map<String, Object> props = createTriggerProps(waitForSeconds);
@@ -235,14 +238,14 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
   }
 
   @Test
-  @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 05-Jul-2018
+  //@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 05-Jul-2018
   public void testSplitIntegration() throws Exception {
     String collectionName = "testSplitIntegration_collection";
     CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collectionName,
         "conf", 2, 2).setMaxShardsPerNode(2);
     create.process(solrClient);
     CloudTestUtils.waitForState(cloudManager, "failed to create " + collectionName, collectionName,
-        CloudTestUtils.clusterShape(2, 2));
+        CloudTestUtils.clusterShape(2, 2, false, true));
 
     long waitForSeconds = 3 + random().nextInt(5);
     // add disabled trigger
@@ -310,7 +313,7 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
 
     boolean await = finished.await(60000 / SPEED, TimeUnit.MILLISECONDS);
     assertTrue("did not finish processing in time", await);
-    CloudTestUtils.waitForState(cloudManager, collectionName, 20, TimeUnit.SECONDS, CloudTestUtils.clusterShape(6, 2, true));
+    CloudTestUtils.waitForState(cloudManager, collectionName, 20, TimeUnit.SECONDS, CloudTestUtils.clusterShape(6, 2, true, true));
     assertEquals(1, listenerEvents.size());
     List<CapturedEvent> events = listenerEvents.get("capturing");
     assertNotNull("'capturing' events not found", events);
@@ -348,14 +351,14 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
   }
 
   @Test
-  @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 05-Jul-2018
+  //@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 05-Jul-2018
   public void testMergeIntegration() throws Exception {
     String collectionName = "testMergeIntegration_collection";
     CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collectionName,
         "conf", 2, 2).setMaxShardsPerNode(2);
     create.process(solrClient);
     CloudTestUtils.waitForState(cloudManager, "failed to create " + collectionName, collectionName,
-        CloudTestUtils.clusterShape(2, 2));
+        CloudTestUtils.clusterShape(2, 2, false, true));
 
     for (int i = 0; i < 10; i++) {
       SolrInputDocument doc = new SolrInputDocument("id", "id-" + (i * 100));
@@ -458,7 +461,7 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
   }
 
   @Test
-  @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 05-Jul-2018
+  //@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 05-Jul-2018
   public void testMixedBounds() throws Exception {
 //    if (cloudManager instanceof SimCloudManager) {
 //      log.warn("Requires SOLR-12208");
@@ -470,7 +473,7 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
         "conf", 2, 2).setMaxShardsPerNode(2);
     create.process(solrClient);
     CloudTestUtils.waitForState(cloudManager, "failed to create " + collectionName, collectionName,
-        CloudTestUtils.clusterShape(2, 2));
+        CloudTestUtils.clusterShape(2, 2, false, true));
 
     for (int j = 0; j < 10; j++) {
       UpdateRequest ureq = new UpdateRequest();
@@ -583,7 +586,7 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
 
     // collection should have 2 inactive and 4 active shards
     CloudTestUtils.waitForState(cloudManager, "failed to create " + collectionName, collectionName,
-        CloudTestUtils.clusterShape(6, 2, true));
+        CloudTestUtils.clusterShape(6, 2, true, true));
 
     // check ops
     List<TriggerEvent.Op> ops = (List<TriggerEvent.Op>) events.get(4).event.getProperty(TriggerEvent.REQUESTED_OPS);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/ScheduledMaintenanceTriggerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/ScheduledMaintenanceTriggerTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/ScheduledMaintenanceTriggerTest.java
index 164db8f..56896a7 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/ScheduledMaintenanceTriggerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/ScheduledMaintenanceTriggerTest.java
@@ -165,7 +165,7 @@ public class ScheduledMaintenanceTriggerTest extends SolrCloudTestCase {
         .setShardName("shard1");
     split1.process(solrClient);
     CloudTestUtils.waitForState(cloudManager, "failed to split " + collection1, collection1,
-        CloudTestUtils.clusterShape(3, 1, true));
+        CloudTestUtils.clusterShape(3, 1, true, true));
 
     String setListenerCommand = "{" +
         "'set-listener' : " +

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java
index f8492d9..19ac4b3 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java
@@ -235,9 +235,8 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
     assertTrue(TimeUnit.SECONDS.convert(waitForSeconds, TimeUnit.NANOSECONDS) - WAIT_FOR_DELTA_NANOS <= now - ev.event.getEventTime());
     Map<String, Double> nodeRates = (Map<String, Double>) ev.event.getProperties().get(SearchRateTrigger.HOT_NODES);
     assertNotNull("nodeRates", nodeRates);
-    assertTrue(nodeRates.toString(), nodeRates.size() > 0);
-    AtomicDouble totalNodeRate = new AtomicDouble();
-    nodeRates.forEach((n, r) -> totalNodeRate.addAndGet(r));
+    // no node violations because node rates weren't set in the config
+    assertTrue(nodeRates.toString(), nodeRates.isEmpty());
     List<ReplicaInfo> replicaRates = (List<ReplicaInfo>) ev.event.getProperties().get(SearchRateTrigger.HOT_REPLICAS);
     assertNotNull("replicaRates", replicaRates);
     assertTrue(replicaRates.toString(), replicaRates.size() > 0);
@@ -260,8 +259,8 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
     Double collectionRate = collectionRates.get(COLL1);
     assertNotNull(collectionRate);
     assertTrue(collectionRate > 5.0);
-    assertEquals(collectionRate, totalNodeRate.get(), 5.0);
-    assertEquals(collectionRate, totalShardRate.get(), 5.0);
+    // two replicas - the trigger calculates average over all searchable replicas
+    assertEquals(collectionRate / 2, totalShardRate.get(), 5.0);
     assertEquals(collectionRate, totalReplicaRate.get(), 5.0);
 
     // check operations
@@ -302,7 +301,11 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
         "'enabled' : false," +
         "'collections' : '" + COLL1 + "'," +
         "'aboveRate' : 1.0," +
-        "'belowRate' : 0.1," +
+        // RecoveryStrategy calls /admin/ping, which calls /select so this may not be zero
+        // even when no external requests were made
+        "'belowRate' : 0.3," +
+        "'aboveNodeRate' : 1.0," +
+        "'belowNodeRate' : 0.3," +
         // do nothing but generate an op
         "'belowNodeOp' : 'none'," +
         "'actions' : [" +
@@ -383,7 +386,7 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
     response = solrClient.request(req);
     assertEquals(response.get("result").toString(), "success");
 
-    timeSource.sleep(5000);
+    timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS));
 
     List<CapturedEvent> events = listenerEvents.get("srt");
     assertEquals(events.toString(), 3, events.size());
@@ -442,13 +445,19 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
     assertEquals(ev.toString(), "compute", ev.actionName);
     ops = (List<TriggerEvent.Op>)ev.event.getProperty(TriggerEvent.REQUESTED_OPS);
     assertNotNull("there should be some requestedOps: " + ev.toString(), ops);
-    assertEquals(ops.toString(), 1, ops.size());
+    assertEquals(ops.toString(), 2, ops.size());
     assertEquals(ops.toString(), CollectionParams.CollectionAction.NONE, ops.get(0).getAction());
+    assertEquals(ops.toString(), CollectionParams.CollectionAction.NONE, ops.get(1).getAction());
+
+    // wait for waitFor to elapse for all types of violations
+    timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds * 2, TimeUnit.SECONDS));
 
     listenerEvents.clear();
     finished = new CountDownLatch(1);
     started = new CountDownLatch(1);
 
+    log.info("## test single replicas.");
+
     // now allow single replicas
     setTriggerCommand = "{" +
         "'set-trigger' : {" +
@@ -458,7 +467,9 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
         "'enabled' : true," +
         "'collections' : '" + COLL1 + "'," +
         "'aboveRate' : 1.0," +
-        "'belowRate' : 0.1," +
+        "'belowRate' : 0.3," +
+        "'aboveNodeRate' : 1.0," +
+        "'belowNodeRate' : 0.3," +
         "'minReplicas' : 1," +
         "'belowNodeOp' : 'none'," +
         "'actions' : [" +
@@ -491,21 +502,20 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
     assertEquals(ev.toString(), "compute", ev.actionName);
     ops = (List<TriggerEvent.Op>)ev.event.getProperty(TriggerEvent.REQUESTED_OPS);
     assertNotNull("there should be some requestedOps: " + ev.toString(), ops);
-    assertEquals(ops.toString(), 2, ops.size());
+
+    assertTrue(ops.toString(), ops.size() > 0);
     AtomicInteger coldNodes2 = new AtomicInteger();
-    AtomicInteger coldReplicas2 = new AtomicInteger();
     ops.forEach(op -> {
       if (op.getAction().equals(CollectionParams.CollectionAction.NONE)) {
         coldNodes2.incrementAndGet();
       } else if (op.getAction().equals(CollectionParams.CollectionAction.DELETEREPLICA)) {
-        coldReplicas2.incrementAndGet();
+        // ignore
       } else {
         fail("unexpected op: " + op);
       }
     });
 
-    assertEquals("coldNodes", 1, coldNodes2.get());
-    assertEquals("colReplicas", 1, coldReplicas2.get());
+    assertEquals("coldNodes: " +ops.toString(), 2, coldNodes2.get());
 
     // now the collection should be at RF == 1, with one additional PULL replica
     CloudTestUtils.waitForState(cloudManager, COLL1, 60, TimeUnit.SECONDS,
@@ -543,6 +553,9 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
         "'collections' : '" + COLL1 + "'," +
         "'aboveRate' : 1.0," +
         "'belowRate' : 0.1," +
+        // set limits on node rates
+        "'aboveNodeRate' : 1.0," +
+        "'belowNodeRate' : 0.1," +
         // allow deleting all spare replicas
         "'minReplicas' : 1," +
         // allow requesting all deletions in one event
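
A minimal, self-contained sketch of a searchRate trigger configuration that sets both replica-level and node-level thresholds, as exercised by the test above. The property names (aboveRate, belowRate, aboveNodeRate, belowNodeRate, minReplicas, belowNodeOp) come from this patch; the trigger name, collection name and rates below are illustrative placeholders, not values from this commit.

    public class SearchRateTriggerConfigSketch {
      public static void main(String[] args) {
        // Hypothetical set-trigger payload, built the same way the test builds it.
        String setTriggerCommand = "{" +
            "'set-trigger' : {" +
            "'name' : 'search_rate_trigger'," +
            "'event' : 'searchRate'," +
            "'waitFor' : '10s'," +
            "'enabled' : true," +
            "'collections' : 'myCollection'," +
            // replica-level thresholds (the trigger averages these over searchable replicas)
            "'aboveRate' : 1.0," +
            "'belowRate' : 0.3," +
            // separate thresholds for the aggregated per-node rates
            "'aboveNodeRate' : 1.0," +
            "'belowNodeRate' : 0.3," +
            "'minReplicas' : 1," +
            "'belowNodeOp' : 'none'" +
            "}}";
        System.out.println(setTriggerCommand);
      }
    }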

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerTest.java
index 60b973f..0e9f4fb 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerTest.java
@@ -174,11 +174,11 @@ public class SearchRateTriggerTest extends SolrCloudTestCase {
       }
       Thread.sleep(waitForSeconds * 1000);
       trigger.run();
-      // should generate node and collection event but not for COLL2 because of waitFor
+      // should generate a collection event, but not for COLL2 because of waitFor
       assertEquals(1, events.size());
       event = events.get(0);
       Map<String, Double> hotNodes = (Map<String, Double>)event.getProperty(SearchRateTrigger.HOT_NODES);
-      assertEquals(3, hotNodes.size());
+      assertTrue("hotNodes", hotNodes.isEmpty());
       hotNodes.forEach((n, r) -> assertTrue(n, r > rate));
       hotCollections = (Map<String, Double>)event.getProperty(SearchRateTrigger.HOT_COLLECTIONS);
       assertEquals(1, hotCollections.size());
@@ -193,7 +193,7 @@ public class SearchRateTriggerTest extends SolrCloudTestCase {
 
       Thread.sleep(waitForSeconds * 1000 * 2);
       trigger.run();
-      // should generate node and collection event
+      // should generate a collection event
       assertEquals(1, events.size());
       event = events.get(0);
       hotCollections = (Map<String, Double>)event.getProperty(SearchRateTrigger.HOT_COLLECTIONS);
@@ -203,8 +203,7 @@ public class SearchRateTriggerTest extends SolrCloudTestCase {
       Rate = hotCollections.get(COLL2);
       assertNotNull(Rate);
       hotNodes = (Map<String, Double>)event.getProperty(SearchRateTrigger.HOT_NODES);
-      assertEquals(3, hotNodes.size());
-      hotNodes.forEach((n, r) -> assertTrue(n, r > rate));
+      assertTrue("hotNodes", hotNodes.isEmpty());
     }
   }
 
@@ -266,7 +265,7 @@ public class SearchRateTriggerTest extends SolrCloudTestCase {
       hotCollections = (Map<String, Object>)event.properties.get(SearchRateTrigger.HOT_COLLECTIONS);
       hotShards = (Map<String, Object>)event.properties.get(SearchRateTrigger.HOT_SHARDS);
       hotReplicas = (List<ReplicaInfo>)event.properties.get(SearchRateTrigger.HOT_REPLICAS);
-      assertFalse("no hot nodes?", hotNodes.isEmpty());
+      assertTrue("no hot nodes?", hotNodes.isEmpty());
       assertFalse("no hot collections?", hotCollections.isEmpty());
       assertFalse("no hot shards?", hotShards.isEmpty());
       assertFalse("no hot replicas?", hotReplicas.isEmpty());

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
index abc3ccf..741a868 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
@@ -53,6 +53,7 @@ import org.apache.solr.client.solrj.impl.ClusterStateProvider;
 import org.apache.solr.client.solrj.request.UpdateRequest;
 import org.apache.solr.client.solrj.response.UpdateResponse;
 import org.apache.solr.cloud.ActionThrottle;
+import org.apache.solr.cloud.CloudTestUtils;
 import org.apache.solr.cloud.Overseer;
 import org.apache.solr.cloud.api.collections.AddReplicaCmd;
 import org.apache.solr.cloud.api.collections.Assign;
@@ -143,6 +144,8 @@ public class SimClusterStateProvider implements ClusterStateProvider {
   private AtomicReference<Map<String, DocCollection>> collectionsStatesRef = new AtomicReference<>();
   private AtomicBoolean saveClusterState = new AtomicBoolean();
 
+  private transient boolean closed;
+
   /**
    * The instance needs to be initialized using the <code>sim*</code> methods in order
    * to ensure proper behavior, otherwise it will behave as a cluster with zero replicas.
@@ -243,7 +246,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
       return null;
     }
     for (ReplicaInfo ri : list) {
-      if (r.getName().equals(ri.getName())) {
+      if (r.getCoreName().equals(ri.getCore())) {
         return ri;
       }
     }
@@ -255,6 +258,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
    * @param nodeId unique node id
    */
   public void simAddNode(String nodeId) throws Exception {
+    ensureNotClosed();
     if (liveNodes.contains(nodeId)) {
       throw new Exception("Node " + nodeId + " already exists");
     }
@@ -271,6 +275,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
    * @return true if a node existed and was removed
    */
   public boolean simRemoveNode(String nodeId) throws Exception {
+    ensureNotClosed();
     lock.lockInterruptibly();
     try {
       Set<String> collections = new HashSet<>();
@@ -346,6 +351,9 @@ public class SimClusterStateProvider implements ClusterStateProvider {
     if (replicas != null) {
       replicas.forEach(r -> {
         r.getVariables().put(ZkStateReader.STATE_PROP, state.toString());
+        if (state != Replica.State.ACTIVE) {
+          r.getVariables().remove(ZkStateReader.LEADER_PROP);
+        }
         changedCollections.add(r.getCollection());
       });
     }
@@ -433,6 +441,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
    * @param runLeaderElection if true then run a leader election after adding the replica.
    */
   public void simAddReplica(String nodeId, ReplicaInfo replicaInfo, boolean runLeaderElection) throws Exception {
+    ensureNotClosed();
     // make sure SolrCore name is unique across cluster and coreNodeName within collection
     for (Map.Entry<String, List<ReplicaInfo>> e : nodeReplicaMap.entrySet()) {
       for (ReplicaInfo ri : e.getValue()) {
@@ -468,8 +477,6 @@ public class SimClusterStateProvider implements ClusterStateProvider {
 
       opDelay(replicaInfo.getCollection(), CollectionParams.CollectionAction.ADDREPLICA.name());
 
-      // at this point nuke our cached DocCollection state
-      collectionsStatesRef.set(null);
       List<ReplicaInfo> replicas = nodeReplicaMap.computeIfAbsent(nodeId, n -> new ArrayList<>());
       // mark replica as active
       replicaInfo.getVariables().put(ZkStateReader.STATE_PROP, Replica.State.ACTIVE.toString());
@@ -481,7 +488,6 @@ public class SimClusterStateProvider implements ClusterStateProvider {
       }
 
       replicas.add(replicaInfo);
-      LOG.trace("-- simAddReplica {}", replicaInfo);
 
       Map<String, Object> values = cloudManager.getSimNodeStateProvider().simGetAllNodeValues()
           .computeIfAbsent(nodeId, id -> new ConcurrentHashMap<>(SimCloudManager.createNodeValues(id)));
@@ -505,6 +511,9 @@ public class SimClusterStateProvider implements ClusterStateProvider {
       cloudManager.getMetricManager().registerGauge(null, registry,
           () -> replicaInfo.getVariable(Type.CORE_IDX.metricsAttribute),
           "", true, "INDEX.sizeInBytes");
+      // at this point nuke our cached DocCollection state
+      collectionsStatesRef.set(null);
+      LOG.trace("-- simAddReplica {}", replicaInfo);
       if (runLeaderElection) {
         simRunLeaderElection(Collections.singleton(replicaInfo.getCollection()), true);
       }
@@ -519,6 +528,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
    * @param coreNodeName coreNodeName
    */
   public void simRemoveReplica(String nodeId, String coreNodeName) throws Exception {
+    ensureNotClosed();
     lock.lockInterruptibly();
     List<ReplicaInfo> replicas = nodeReplicaMap.computeIfAbsent(nodeId, n -> new ArrayList<>());
     try {
@@ -591,20 +601,30 @@ public class SimClusterStateProvider implements ClusterStateProvider {
    * @param saveClusterState if true then save cluster state regardless of changes.
    */
   private void simRunLeaderElection(Collection<String> collections, boolean saveClusterState) throws Exception {
-    ClusterState state = getClusterState();
+    ensureNotClosed();
     if (saveClusterState) {
       collectionsStatesRef.set(null);
     }
+    ClusterState state = getClusterState();
     state.forEachCollection(dc -> {
       if (!collections.contains(dc.getName())) {
         return;
       }
-      dc.getSlices().forEach(s ->
+      dc.getSlices().forEach(s -> {
+        if (s.getLeader() != null) {
+          LOG.debug("-- already has leader {} / {}", dc.getName(), s.getName());
+          return;
+        }
+        if (s.getReplicas().isEmpty()) {
+          LOG.debug("-- no replicas in {} / {}", dc.getName(), s.getName());
+          return;
+        }
+        LOG.debug("-- submit leader election for {} / {}", dc.getName(), s.getName());
         cloudManager.submit(() -> {
           simRunLeaderElection(dc.getName(), s, saveClusterState);
           return true;
-        })
-      );
+        });
+      });
     });
   }
 
@@ -612,59 +632,73 @@ public class SimClusterStateProvider implements ClusterStateProvider {
     AtomicBoolean stateChanged = new AtomicBoolean(Boolean.FALSE);
     Replica leader = s.getLeader();
     if (leader == null || !liveNodes.contains(leader.getNodeName())) {
-      LOG.trace("Running leader election for " + collection + " / " + s.getName());
+      LOG.debug("Running leader election for {} / {}", collection, s.getName());
       if (s.getReplicas().isEmpty()) { // no replicas - punt
+        LOG.debug("-- no replicas in {} / {}", collection, s.getName());
         return;
       }
       ActionThrottle lt = getThrottle(collection, s.getName());
-      lt.minimumWaitBetweenActions();
-      lt.markAttemptingAction();
-
-      // mark all replicas as non-leader (probably not necessary) and collect all active and live
-      List<ReplicaInfo> active = new ArrayList<>();
-      s.getReplicas().forEach(r -> {
-        // find our ReplicaInfo for this replica
-        ReplicaInfo ri = getReplicaInfo(r);
-        if (ri == null) {
-          throw new IllegalStateException("-- could not find ReplicaInfo for replica " + r);
-        }
-        synchronized (ri) {
-          if (ri.getVariables().remove(ZkStateReader.LEADER_PROP) != null) {
-            stateChanged.set(true);
+      synchronized (lt) {
+        // collect all replicas that are active and live
+        List<ReplicaInfo> active = new ArrayList<>();
+        AtomicBoolean alreadyHasLeader = new AtomicBoolean(false);
+        s.getReplicas().forEach(r -> {
+          // find our ReplicaInfo for this replica
+          ReplicaInfo ri = getReplicaInfo(r);
+          if (ri == null) {
+            throw new IllegalStateException("-- could not find ReplicaInfo for replica " + r);
           }
-          if (r.isActive(liveNodes.get())) {
-            active.add(ri);
-          } else { // if it's on a node that is not live mark it down
-            if (!liveNodes.contains(r.getNodeName())) {
-              ri.getVariables().put(ZkStateReader.STATE_PROP, Replica.State.DOWN.toString());
-              stateChanged.set(true);
+          synchronized (ri) {
+            if (r.isActive(liveNodes.get())) {
+              if (ri.getVariables().get(ZkStateReader.LEADER_PROP) != null) {
+                LOG.trace("-- found existing leader {} / {}: {}, {}", collection, s.getName(), ri, r);
+                alreadyHasLeader.set(true);
+                return;
+              } else {
+                active.add(ri);
+              }
+            } else { // if it's on a node that is not live mark it down
+              LOG.trace("-- replica not active on live nodes: {}, {}", liveNodes.get(), r);
+              if (!liveNodes.contains(r.getNodeName())) {
+                ri.getVariables().put(ZkStateReader.STATE_PROP, Replica.State.DOWN.toString());
+                ri.getVariables().remove(ZkStateReader.LEADER_PROP);
+                stateChanged.set(true);
+              }
             }
           }
+        });
+        if (alreadyHasLeader.get()) {
+          LOG.debug("-- already has leader {} / {}: {}", collection, s.getName(), s);
+          return;
         }
-      });
-      if (active.isEmpty()) {
-        LOG.warn("-- can't find any active replicas for " + collection + " / " + s.getName());
-        return;
-      }
-      // pick first active one
-      ReplicaInfo ri = null;
-      for (ReplicaInfo a : active) {
-        if (!a.getType().equals(Replica.Type.PULL)) {
-          ri = a;
-          break;
+        if (active.isEmpty()) {
+          LOG.warn("Can't find any active replicas for {} / {}: {}", collection, s.getName(), s);
+          LOG.debug("-- liveNodes: {}", liveNodes.get());
+          return;
         }
+        // pick first active one
+        ReplicaInfo ri = null;
+        for (ReplicaInfo a : active) {
+          if (!a.getType().equals(Replica.Type.PULL)) {
+            ri = a;
+            break;
+          }
+        }
+        if (ri == null) {
+          LOG.warn("-- can't find any suitable replica type for {} / {}: {}", collection, s.getName(), s);
+          return;
+        }
+        // now mark the leader election throttle
+        lt.minimumWaitBetweenActions();
+        lt.markAttemptingAction();
+        synchronized (ri) {
+          ri.getVariables().put(ZkStateReader.LEADER_PROP, "true");
+        }
+        stateChanged.set(true);
+        LOG.debug("-- elected new leader for " + collection + " / " + s.getName() + ": " + ri.getName());
       }
-      if (ri == null) {
-        LOG.warn("-- can't find any suitable replica type for " + collection + " / " + s.getName());
-        return;
-      }
-      synchronized (ri) {
-        ri.getVariables().put(ZkStateReader.LEADER_PROP, "true");
-      }
-      stateChanged.set(true);
-      LOG.debug("-- elected new leader for " + collection + " / " + s.getName() + ": " + ri);
     } else {
-      LOG.trace("-- already has leader for {} / {}", collection, s.getName());
+      LOG.debug("-- already has leader for {} / {}", collection, s.getName());
     }
     if (stateChanged.get() || saveState) {
       collectionsStatesRef.set(null);
@@ -678,6 +712,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
    * @param results results of the operation.
    */
   public void simCreateCollection(ZkNodeProps props, NamedList results) throws Exception {
+    ensureNotClosed();
     if (props.getStr(CommonAdminParams.ASYNC) != null) {
       results.add(CoreAdminParams.REQUESTID, props.getStr(CommonAdminParams.ASYNC));
     }
@@ -781,7 +816,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
         ReplicaInfo ri = new ReplicaInfo("core_node" + Assign.incAndGetId(stateManager, collectionName, 0),
             coreName, collectionName, pos.shard, pos.type, pos.node, replicaProps);
         cloudManager.submit(() -> {
-          simAddReplica(pos.node, ri, false);
+          simAddReplica(pos.node, ri, true);
           finalStateLatch.countDown();
           return true;
         });
@@ -814,7 +849,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
 
     // force recreation of collection states
     collectionsStatesRef.set(null);
-    simRunLeaderElection(Collections.singleton(collectionName), true);
+    //simRunLeaderElection(Collections.singleton(collectionName), true);
     if (waitForFinalState) {
       boolean finished = finalStateLatch.await(cloudManager.getTimeSource().convertDelay(TimeUnit.SECONDS, 60, TimeUnit.MILLISECONDS),
           TimeUnit.MILLISECONDS);
@@ -833,6 +868,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
    * @param results results of the operation
    */
   public void simDeleteCollection(String collection, String async, NamedList results) throws Exception {
+    ensureNotClosed();
     if (async != null) {
       results.add(CoreAdminParams.REQUESTID, async);
     }
@@ -903,6 +939,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
    * @param results operation results.
    */
   public void simMoveReplica(ZkNodeProps message, NamedList results) throws Exception {
+    ensureNotClosed();
     if (message.getStr(CommonAdminParams.ASYNC) != null) {
       results.add(CoreAdminParams.REQUESTID, message.getStr(CommonAdminParams.ASYNC));
     }
@@ -950,6 +987,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
    * @param results operation results
    */
   public void simCreateShard(ZkNodeProps message, NamedList results) throws Exception {
+    ensureNotClosed();
     if (message.getStr(CommonAdminParams.ASYNC) != null) {
       results.add(CoreAdminParams.REQUESTID, message.getStr(CommonAdminParams.ASYNC));
     }
@@ -1017,6 +1055,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
    * @param results operation results.
    */
   public void simSplitShard(ZkNodeProps message, NamedList results) throws Exception {
+    ensureNotClosed();
     if (message.getStr(CommonAdminParams.ASYNC) != null) {
       results.add(CoreAdminParams.REQUESTID, message.getStr(CommonAdminParams.ASYNC));
     }
@@ -1130,6 +1169,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
    * @param results operation results
    */
   public void simDeleteShard(ZkNodeProps message, NamedList results) throws Exception {
+    ensureNotClosed();
     if (message.getStr(CommonAdminParams.ASYNC) != null) {
       results.add(CoreAdminParams.REQUESTID, message.getStr(CommonAdminParams.ASYNC));
     }
@@ -1180,6 +1220,8 @@ public class SimClusterStateProvider implements ClusterStateProvider {
           CommonAdminParams.WAIT_FOR_FINAL_STATE, "true"
       );
       simCreateCollection(props, new NamedList());
+      CloudTestUtils.waitForState(cloudManager, CollectionAdminParams.SYSTEM_COLL, 20, TimeUnit.SECONDS,
+          CloudTestUtils.clusterShape(1, 1, false, true));
     } catch (Exception e) {
       throw new IOException(e);
     }
@@ -1204,6 +1246,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
    * @throws SolrException on errors, such as nonexistent collection or unsupported deleteByQuery
    */
   public UpdateResponse simUpdate(UpdateRequest req) throws SolrException, InterruptedException, IOException {
+    ensureNotClosed();
     String collection = req.getCollection();
     if (collection == null) {
       throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Collection not set");
@@ -1699,6 +1742,17 @@ public class SimClusterStateProvider implements ClusterStateProvider {
 
   @Override
   public void close() throws IOException {
+    closed = true;
+  }
 
+  @Override
+  public boolean isClosed() {
+    return closed;
+  }
+
+  private void ensureNotClosed() throws IOException {
+    if (closed) {
+      throw new IOException("already closed");
+    }
   }
 }
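
The close()/ensureNotClosed() additions follow a plain close-guard pattern. A self-contained sketch of the same idea; the class and method names here are illustrative, and the sketch uses a volatile flag where the patch uses a transient field.

    import java.io.Closeable;
    import java.io.IOException;

    public class ClosedGuardSketch implements Closeable {
      private volatile boolean closed;

      // Every simulated operation checks the flag first, mirroring the sim* methods above.
      public void simDoSomething() throws IOException {
        ensureNotClosed();
        // ... perform the simulated operation ...
      }

      private void ensureNotClosed() throws IOException {
        if (closed) {
          throw new IOException("already closed");
        }
      }

      public boolean isClosed() {
        return closed;
      }

      @Override
      public void close() {
        closed = true;
      }
    }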

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java
index cb8640c..7a346ea 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java
@@ -30,6 +30,8 @@ import java.util.TreeSet;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.locks.ReentrantLock;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 import org.apache.solr.client.solrj.cloud.NodeStateProvider;
@@ -230,6 +232,7 @@ public class SimNodeStateProvider implements NodeStateProvider {
     }
   }
 
+  private static final Pattern REGISTRY_PATTERN = Pattern.compile("^solr\\.core\\.([\\w._-]+?)\\.(shard[\\d_]+?)\\.(replica.*)");
   /**
    * Simulate getting replica metrics values. This uses per-replica properties set in
    * {@link SimClusterStateProvider#simSetCollectionValue(String, String, Object, boolean, boolean)} and
@@ -257,14 +260,15 @@ public class SimNodeStateProvider implements NodeStateProvider {
         // skip - this is probably solr.node or solr.jvm metric
         continue;
       }
-      String[] collParts = parts[1].substring(10).split("\\.");
-      if (collParts.length != 3) {
+      Matcher m = REGISTRY_PATTERN.matcher(parts[1]);
+
+      if (!m.matches()) {
         LOG.warn("Invalid registry name: " + parts[1]);
         continue;
       }
-      String collection = collParts[0];
-      String shard = collParts[1];
-      String replica = collParts[2];
+      String collection = m.group(1);
+      String shard = m.group(2);
+      String replica = m.group(3);
       String key = parts.length > 3 ? parts[2] + ":" + parts[3] : parts[2];
       replicas.forEach(r -> {
         if (r.getCollection().equals(collection) && r.getShard().equals(shard) && r.getCore().endsWith(replica)) {
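
For reference, a standalone sketch of parsing a core registry name of the form solr.core.<collection>.<shard>.<replica> with this kind of pattern; the sample registry name is made up, and unlike the fixed split(".") the regex tolerates dots inside the collection name.

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class RegistryNameSketch {
      private static final Pattern REGISTRY_PATTERN =
          Pattern.compile("^solr\\.core\\.([\\w._-]+?)\\.(shard[\\d_]+?)\\.(replica.*)");

      public static void main(String[] args) {
        String registry = "solr.core.my.collection.shard1_1.replica_n3"; // hypothetical name
        Matcher m = REGISTRY_PATTERN.matcher(registry);
        if (m.matches()) {
          System.out.println("collection = " + m.group(1)); // my.collection
          System.out.println("shard      = " + m.group(2)); // shard1_1
          System.out.println("replica    = " + m.group(3)); // replica_n3
        } else {
          System.out.println("not a core registry: " + registry);
        }
      }
    }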

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestComputePlanAction.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestComputePlanAction.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestComputePlanAction.java
index 9edf0ed..c1f3a56 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestComputePlanAction.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestComputePlanAction.java
@@ -147,7 +147,7 @@ public class TestComputePlanAction extends SimSolrCloudTestCase {
     create.process(solrClient);
 
     CloudTestUtils.waitForState(cluster, "Timed out waiting for replicas of new collection to be active",
-        "testNodeLost", CloudTestUtils.clusterShape(1, 2));
+        "testNodeLost", CloudTestUtils.clusterShape(1, 2, false, true));
 
     ClusterState clusterState = cluster.getClusterStateProvider().getClusterState();
     log.debug("-- cluster state: {}", clusterState);
@@ -210,7 +210,7 @@ public class TestComputePlanAction extends SimSolrCloudTestCase {
     create.process(solrClient);
 
     CloudTestUtils.waitForState(cluster, "Timed out waiting for replicas of new collection to be active",
-        "testNodeWithMultipleReplicasLost", CloudTestUtils.clusterShape(2, 3));
+        "testNodeWithMultipleReplicasLost", CloudTestUtils.clusterShape(2, 3, false, true));
 
     ClusterState clusterState = cluster.getClusterStateProvider().getClusterState();
     log.debug("-- cluster state: {}", clusterState);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestExecutePlanAction.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestExecutePlanAction.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestExecutePlanAction.java
index 36d72b8..796dc3b 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestExecutePlanAction.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestExecutePlanAction.java
@@ -90,7 +90,7 @@ public class TestExecutePlanAction extends SimSolrCloudTestCase {
     create.process(solrClient);
 
     log.info("Collection ready after " + CloudTestUtils.waitForState(cluster, collectionName, 120, TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(1, 2)) + "ms");
+        CloudTestUtils.clusterShape(1, 2, false, true)) + "ms");
 
     String sourceNodeName = cluster.getSimClusterStateProvider().simGetRandomNode(random());
     ClusterState clusterState = cluster.getClusterStateProvider().getClusterState();
@@ -152,7 +152,7 @@ public class TestExecutePlanAction extends SimSolrCloudTestCase {
     }
 
     log.info("Collection ready after " + CloudTestUtils.waitForState(cluster, collectionName, 300, TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(1, 2)) + "ms");
+        CloudTestUtils.clusterShape(1, 2, false, true)) + "ms");
   }
 
   @Test
@@ -179,7 +179,7 @@ public class TestExecutePlanAction extends SimSolrCloudTestCase {
     create.process(solrClient);
 
     CloudTestUtils.waitForState(cluster, "Timed out waiting for replicas of new collection to be active",
-        collectionName, CloudTestUtils.clusterShape(1, 2));
+        collectionName, CloudTestUtils.clusterShape(1, 2, false, true));
 
     String sourceNodeName = cluster.getSimClusterStateProvider().simGetRandomNode(random());
     ClusterState clusterState = cluster.getClusterStateProvider().getClusterState();
@@ -195,8 +195,10 @@ public class TestExecutePlanAction extends SimSolrCloudTestCase {
 
     cluster.simRemoveNode(sourceNodeName, false);
 
+    cluster.getTimeSource().sleep(3000);
+
     CloudTestUtils.waitForState(cluster, "Timed out waiting for replicas of collection to be 2 again",
-        collectionName, CloudTestUtils.clusterShape(1, 2));
+        collectionName, CloudTestUtils.clusterShape(1, 2, false, true));
 
     clusterState = cluster.getClusterStateProvider().getClusterState();
     docCollection = clusterState.getCollection(collectionName);
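
The extra cluster.getTimeSource().sleep(3000) above lets simulated time pass before re-checking the collection shape. A small sketch of sleeping through a TimeSource rather than Thread.sleep, so the same code works against both real and accelerated simulated clocks; TimeSource.NANO_TIME is used here only for illustration.

    import java.util.concurrent.TimeUnit;

    import org.apache.solr.common.util.TimeSource;

    public class TimeSourceSleepSketch {
      public static void main(String[] args) throws InterruptedException {
        TimeSource timeSource = TimeSource.NANO_TIME; // sim tests would use the cluster's source
        long startNs = timeSource.getTimeNs();
        timeSource.sleep(3000); // milliseconds, interpreted by the time source
        long elapsedMs = TimeUnit.NANOSECONDS.toMillis(timeSource.getTimeNs() - startNs);
        System.out.println("slept ~" + elapsedMs + " ms");
      }
    }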

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java
index 61fedb8..93f92ec 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java
@@ -87,6 +87,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
   static Map<String, List<CapturedEvent>> listenerEvents = new ConcurrentHashMap<>();
   static AtomicInteger triggerFinishedCount = new AtomicInteger();
   static AtomicInteger triggerStartedCount = new AtomicInteger();
+  static CountDownLatch triggerStartedLatch;
   static CountDownLatch triggerFinishedLatch;
   static int waitForSeconds;
 
@@ -100,6 +101,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     waitForSeconds = 5;
     triggerStartedCount.set(0);
     triggerFinishedCount.set(0);
+    triggerStartedLatch = new CountDownLatch(1);
     triggerFinishedLatch = new CountDownLatch(1);
     listenerEvents.clear();
     // disable .scheduled_maintenance and .auto_add_replicas
@@ -108,20 +110,33 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
         "}";
     SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.POST, suspendTriggerCommand);
     SolrClient solrClient = cluster.simGetSolrClient();
-    NamedList<Object> response = solrClient.request(req);
-    assertEquals(response.get("result").toString(), "success");
+    NamedList<Object> response;
+    try {
+      response = solrClient.request(req);
+      assertEquals(response.get("result").toString(), "success");
+    } catch (Exception e) {
+      if (!e.toString().contains("No trigger exists")) {
+        throw e;
+      }
+    }
     suspendTriggerCommand = "{" +
         "'suspend-trigger' : {'name' : '.auto_add_replicas'}" +
         "}";
     req = createAutoScalingRequest(SolrRequest.METHOD.POST, suspendTriggerCommand);
-    response = solrClient.request(req);
-    assertEquals(response.get("result").toString(), "success");
+    try {
+      response = solrClient.request(req);
+      assertEquals(response.get("result").toString(), "success");
+    } catch (Exception e) {
+      if (!e.toString().contains("No trigger exists")) {
+        throw e;
+      }
+    }
 
     // do this in advance if missing
     if (!cluster.getSimClusterStateProvider().simListCollections().contains(CollectionAdminParams.SYSTEM_COLL)) {
       cluster.getSimClusterStateProvider().createSystemCollection();
       CloudTestUtils.waitForState(cluster, CollectionAdminParams.SYSTEM_COLL, 120, TimeUnit.SECONDS,
-          CloudTestUtils.clusterShape(1, 1));
+          CloudTestUtils.clusterShape(1, 1, false, true));
     }
   }
 
@@ -150,6 +165,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
   public static class StartTriggerAction extends TriggerActionBase {
     @Override
     public void process(TriggerEvent event, ActionContext context) throws Exception {
+      triggerStartedLatch.countDown();
       triggerStartedCount.incrementAndGet();
     }
   }
@@ -212,7 +228,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     create.process(solrClient);
 
     log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 30 * nodes.size(), TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(5, 15)) + "ms");
+        CloudTestUtils.clusterShape(5, 15, false, true)) + "ms");
 
     int KILL_NODES = 8;
     // kill off a number of nodes
@@ -221,7 +237,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     }
     // should fully recover
     log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 90 * KILL_NODES, TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(5, 15)) + "ms");
+        CloudTestUtils.clusterShape(5, 15, false, true)) + "ms");
 
     log.info("OP COUNTS: " + cluster.simGetOpCounts());
     long moveReplicaOps = cluster.simGetOpCount(CollectionParams.CollectionAction.MOVEREPLICA.name());
@@ -256,7 +272,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
 
 
     log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 30 * nodes.size(), TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(5, 15)) + "ms");
+        CloudTestUtils.clusterShape(5, 15, false, true)) + "ms");
     long newMoveReplicaOps = cluster.simGetOpCount(CollectionParams.CollectionAction.MOVEREPLICA.name());
     log.info("==== Flaky replicas: {}. Additional MOVEREPLICA count: {}", flakyReplicas, (newMoveReplicaOps - moveReplicaOps));
     // flaky nodes lead to a number of MOVEREPLICA that is non-zero but lower than the number of flaky replicas
@@ -295,7 +311,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     create.process(solrClient);
 
     log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3)) + " ms");
+        CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3, false, true)) + " ms");
 
     // start adding nodes
     int numAddNode = NUM_NODES / 5;
@@ -305,7 +321,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
       cluster.getTimeSource().sleep(5000);
     }
     // wait until at least one event is generated
-    boolean await = triggerFinishedLatch.await(10000 / SPEED, TimeUnit.MILLISECONDS);
+    boolean await = triggerStartedLatch.await(20000 / SPEED, TimeUnit.MILLISECONDS);
     assertTrue("trigger did not fire", await);
 
     // wait until started == finished
@@ -340,7 +356,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     assertTrue("no MOVEREPLICA ops?", cluster.simGetOpCount("MOVEREPLICA") > 0);
 
     log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3)) + " ms");
+        CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3, false, true)) + " ms");
 
     int count = 50;
     SolrInputDocument finishedEvent = null;
@@ -471,6 +487,20 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     NamedList<Object> response = solrClient.request(req);
     assertEquals(response.get("result").toString(), "success");
 
+    String setListenerCommand = "{" +
+        "'set-listener' : " +
+        "{" +
+        "'name' : 'failures'," +
+        "'trigger' : 'node_lost_trigger3'," +
+        "'stage' : ['FAILED']," +
+        "'class' : '" + TestTriggerListener.class.getName() + "'" +
+        "}" +
+        "}";
+    req = createAutoScalingRequest(SolrRequest.METHOD.POST, setListenerCommand);
+    response = solrClient.request(req);
+    assertEquals(response.get("result").toString(), "success");
+
+
     // create a collection with 1 replica per node
     String collectionName = "testNodeLost";
     CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collectionName,
@@ -480,7 +510,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     create.process(solrClient);
 
     log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(NUM_NODES / 5, NUM_NODES / 10)) + " ms");
+        CloudTestUtils.clusterShape(NUM_NODES / 5, NUM_NODES / 10, false, true)) + " ms");
 
     // start killing nodes
     int numNodes = NUM_NODES / 5;
@@ -491,7 +521,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
       cluster.simRemoveNode(nodes.get(i), false);
       cluster.getTimeSource().sleep(killDelay);
     }
-    // wait for the trigger to fire at least once
+    // wait for the trigger to fire and complete at least once
     boolean await = triggerFinishedLatch.await(20 * waitFor * 1000 / SPEED, TimeUnit.MILLISECONDS);
     assertTrue("trigger did not fire within timeout, " +
         "waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored,
@@ -513,16 +543,23 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
             "waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored,
           startedEventPos > -1);
     SolrInputDocument startedEvent = systemColl.get(startedEventPos);
-    // wait until started == finished
+    // we can expect some failures when the target node of a MOVEREPLICA was killed
+    // between the start of event processing and the actual execution of MOVEREPLICA
+    // wait until started == (finished + failed)
     TimeOut timeOut = new TimeOut(20 * waitFor * NUM_NODES, TimeUnit.SECONDS, cluster.getTimeSource());
     while (!timeOut.hasTimedOut()) {
       if (triggerStartedCount.get() == triggerFinishedCount.get()) {
         break;
       }
+      log.debug("started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get() +
+          ", failed=" + listenerEvents.size());
       timeOut.sleep(1000);
     }
     if (timeOut.hasTimedOut()) {
-      fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get());
+      if (triggerStartedCount.get() > triggerFinishedCount.get() + listenerEvents.size()) {
+        fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get() +
+            ", failed=" + listenerEvents.size());
+      }
     }
     int ignored = 0;
     int lastIgnoredPos = startedEventPos;
@@ -546,8 +583,13 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
             "waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored,
             cluster.simGetOpCount("MOVEREPLICA") > 0);
 
-    log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(NUM_NODES / 5, NUM_NODES / 10)) + " ms");
+    if (listenerEvents.isEmpty()) {
+      // no failed movements - verify collection shape
+      log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
+          CloudTestUtils.clusterShape(NUM_NODES / 5, NUM_NODES / 10, false, true)) + " ms");
+    } else {
+      cluster.getTimeSource().sleep(NUM_NODES * 100);
+    }
 
     int count = 50;
     SolrInputDocument finishedEvent = null;
@@ -580,9 +622,13 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     delta = TimeUnit.NANOSECONDS.toMillis(delta);
     log.info("#### System stabilized after " + delta + " ms");
     long ops = cluster.simGetOpCount("MOVEREPLICA");
-    assertTrue("unexpected number of MOVEREPLICA ops: " + ops + ", " +
+    long expectedMinOps = 40;
+    if (!listenerEvents.isEmpty()) {
+      expectedMinOps = 20;
+    }
+    assertTrue("unexpected number (" + expectedMinOps + ") of MOVEREPLICA ops: " + ops + ", " +
             "waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored,
-            ops >= 40);
+            ops >= expectedMinOps);
     return delta;
   }
 
@@ -596,7 +642,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     create.process(solrClient);
 
     log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 300, TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(2, 10)) + " ms");
+        CloudTestUtils.clusterShape(2, 10, false, true)) + " ms");
 
     // collect the node names for shard1
     Set<String> nodes = new HashSet<>();
@@ -617,6 +663,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
         "'event' : 'searchRate'," +
         "'waitFor' : '" + waitForSeconds + "s'," +
         "'aboveRate' : 1.0," +
+        "'aboveNodeRate' : 1.0," +
         "'enabled' : true," +
         "'actions' : [" +
         "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," +
@@ -655,7 +702,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     m.forEach((k, v) -> assertEquals(4.0, v.doubleValue(), 0.01));
     List<TriggerEvent.Op> ops = (List<TriggerEvent.Op>)ev.event.getProperty(TriggerEvent.REQUESTED_OPS);
     assertNotNull(ops);
-    assertEquals(3, ops.size());
+    assertEquals(ops.toString(), 1, ops.size());
     ops.forEach(op -> {
       assertEquals(CollectionParams.CollectionAction.ADDREPLICA, op.getAction());
       assertEquals(1, op.getHints().size());
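
The stabilization check above now waits until every started trigger event has either finished or failed. A self-contained sketch of that kind of bounded wait; the counters and the deadline stand in for the test's AtomicIntegers and TimeOut helper.

    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.atomic.AtomicInteger;

    public class BoundedWaitSketch {
      public static void main(String[] args) throws InterruptedException {
        AtomicInteger started = new AtomicInteger(3);
        AtomicInteger finished = new AtomicInteger(2);
        AtomicInteger failed = new AtomicInteger(0);

        long deadline = System.nanoTime() + TimeUnit.SECONDS.toNanos(10);
        while (System.nanoTime() < deadline) {
          // accept either successful completion or a reported failure for each started event
          if (started.get() <= finished.get() + failed.get()) {
            break;
          }
          Thread.sleep(100);
          failed.incrementAndGet(); // simulate a failure listener firing
        }
        if (started.get() > finished.get() + failed.get()) {
          throw new AssertionError("did not finish processing all events in time: started="
              + started.get() + ", finished=" + finished.get() + ", failed=" + failed.get());
        }
        System.out.println("all started events accounted for");
      }
    }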

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestPolicyCloud.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestPolicyCloud.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestPolicyCloud.java
index 1adbabb..3237639 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestPolicyCloud.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestPolicyCloud.java
@@ -69,7 +69,7 @@ public class TestPolicyCloud extends SimSolrCloudTestCase {
         .process(solrClient);
 
     CloudTestUtils.waitForState(cluster, "Timeout waiting for collection to become active", "perReplicaDataColl",
-        CloudTestUtils.clusterShape(1, 5));
+        CloudTestUtils.clusterShape(1, 5, false, true));
     DocCollection coll = getCollectionState("perReplicaDataColl");
     String autoScaleJson = "{" +
         "  'cluster-preferences': [" +
@@ -119,7 +119,7 @@ public class TestPolicyCloud extends SimSolrCloudTestCase {
         .setPolicy("c1")
         .process(solrClient);
     CloudTestUtils.waitForState(cluster, "Timeout waiting for collection to become active", collectionName,
-        CloudTestUtils.clusterShape(1, 1));
+        CloudTestUtils.clusterShape(1, 1, false, true));
 
     getCollectionState(collectionName).forEachReplica((s, replica) -> assertEquals(nodeId, replica.getNodeName()));
 
@@ -152,7 +152,7 @@ public class TestPolicyCloud extends SimSolrCloudTestCase {
         .setPolicy("c1")
         .process(solrClient);
     CloudTestUtils.waitForState(cluster, "Timeout waiting for collection to become active", collectionName,
-        CloudTestUtils.clusterShape(1, 2));
+        CloudTestUtils.clusterShape(1, 2, false, true));
 
     DocCollection docCollection = getCollectionState(collectionName);
     List<Replica> list = docCollection.getReplicas(firstNode);
@@ -257,7 +257,7 @@ public class TestPolicyCloud extends SimSolrCloudTestCase {
     CollectionAdminRequest.createCollectionWithImplicitRouter("policiesTest", "conf", "s1", 1, 1, 1)
         .process(solrClient);
     CloudTestUtils.waitForState(cluster, "Timeout waiting for collection to become active", "policiesTest",
-        CloudTestUtils.clusterShape(1, 3));
+        CloudTestUtils.clusterShape(1, 3, false, true));
 
     DocCollection coll = getCollectionState("policiesTest");
 
@@ -322,7 +322,7 @@ public class TestPolicyCloud extends SimSolrCloudTestCase {
     CollectionAdminRequest.createCollectionWithImplicitRouter("policiesTest", "conf", "shard1", 2)
         .process(solrClient);
     CloudTestUtils.waitForState(cluster, "Timeout waiting for collection to become active", "policiesTest",
-        CloudTestUtils.clusterShape(1, 2));
+        CloudTestUtils.clusterShape(1, 2, false, true));
     DocCollection rulesCollection = getCollectionState("policiesTest");
 
     Map<String, Object> val = cluster.getNodeStateProvider().getNodeValues(rulesCollection.getReplicas().get(0).getNodeName(), Arrays.asList(