Posted to commits@lucene.apache.org by ab...@apache.org on 2018/08/09 19:26:30 UTC

lucene-solr:jira/solr-12470: SOLR-12470: Fix several bugs in rate calculation and strategies for calculating ops based on hot/cold shards and nodes. Fix some issues in the simulator.

Repository: lucene-solr
Updated Branches:
  refs/heads/jira/solr-12470 [created] d8fb69166


SOLR-12470: Fix several bugs in rate calculation and strategies for calculating
ops based on hot/cold shards and nodes. Fix some issues in the simulator.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/d8fb6916
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/d8fb6916
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/d8fb6916

Branch: refs/heads/jira/solr-12470
Commit: d8fb6916628626355fab9162f747ed19360acbf5
Parents: 3898653
Author: Andrzej Bialecki <ab...@apache.org>
Authored: Thu Aug 9 21:25:18 2018 +0200
Committer: Andrzej Bialecki <ab...@apache.org>
Committed: Thu Aug 9 21:25:18 2018 +0200

----------------------------------------------------------------------
 .../cloud/autoscaling/ScheduledTriggers.java    |   7 +-
 .../cloud/autoscaling/SearchRateTrigger.java    | 157 +++++++++++++------
 .../autoscaling/TriggerActionException.java     |  33 ++++
 .../org/apache/solr/cloud/CloudTestUtils.java   |  23 ++-
 .../cloud/autoscaling/IndexSizeTriggerTest.java |   4 +-
 .../ScheduledMaintenanceTriggerTest.java        |   2 +-
 .../SearchRateTriggerIntegrationTest.java       |  23 ++-
 .../autoscaling/SearchRateTriggerTest.java      |  11 +-
 .../sim/SimClusterStateProvider.java            | 110 +++++++------
 .../autoscaling/sim/SimNodeStateProvider.java   |  14 +-
 .../autoscaling/sim/TestComputePlanAction.java  |   4 +-
 .../autoscaling/sim/TestExecutePlanAction.java  |  10 +-
 .../cloud/autoscaling/sim/TestLargeCluster.java |  51 ++++--
 .../cloud/autoscaling/sim/TestPolicyCloud.java  |  10 +-
 .../autoscaling/sim/TestTriggerIntegration.java |   3 +-
 .../src/solrcloud-autoscaling-triggers.adoc     |  38 ++++-
 16 files changed, 350 insertions(+), 150 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d8fb6916/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java
index c125209..7c3cbb0 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java
@@ -308,7 +308,7 @@ public class ScheduledTriggers implements Closeable {
                   action.process(event, actionContext);
                 } catch (Exception e) {
                   triggerListeners1.fireListeners(event.getSource(), event, TriggerEventProcessorStage.FAILED, action.getName(), actionContext, e, null);
-                  throw new Exception("Error executing action: " + action.getName() + " for trigger event: " + event, e);
+                  throw new TriggerActionException(event.getSource(), action.getName(), "Error processing action for trigger event: " + event, e);
                 }
                 List<String> afterActions = (List<String>) actionContext.getProperties().computeIfAbsent(TriggerEventProcessorStage.AFTER_ACTION.toString(), k -> new ArrayList<String>());
                 afterActions.add(action.getName());
@@ -319,8 +319,11 @@ public class ScheduledTriggers implements Closeable {
                 assert ev.getId().equals(event.getId());
               }
               triggerListeners1.fireListeners(event.getSource(), event, TriggerEventProcessorStage.SUCCEEDED);
-            } catch (Exception e) {
+            } catch (TriggerActionException e) {
               log.warn("Exception executing actions", e);
+            } catch (Exception e) {
+              triggerListeners1.fireListeners(event.getSource(), event, TriggerEventProcessorStage.FAILED);
+              log.warn("Unhandled exception executing actions", e);
             } finally {
               cooldownStart.set(cloudManager.getTimeSource().getTimeNs());
               hasPendingActions.set(false);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d8fb6916/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java
index 1824f7f..a875674 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java
@@ -63,6 +63,8 @@ public class SearchRateTrigger extends TriggerBase {
   public static final String MIN_REPLICAS_PROP = "minReplicas";
   public static final String ABOVE_RATE_PROP = "aboveRate";
   public static final String BELOW_RATE_PROP = "belowRate";
+  public static final String ABOVE_NODE_RATE_PROP = "aboveNodeRate";
+  public static final String BELOW_NODE_RATE_PROP = "belowNodeRate";
   public static final String ABOVE_OP_PROP = "aboveOp";
   public static final String BELOW_OP_PROP = "belowOp";
   public static final String ABOVE_NODE_OP_PROP = "aboveNodeOp";
@@ -81,6 +83,7 @@ public class SearchRateTrigger extends TriggerBase {
   public static final String COLD_COLLECTIONS = "coldCollections";
   public static final String COLD_SHARDS = "coldShards";
   public static final String COLD_REPLICAS = "coldReplicas";
+  public static final String VIOLATION_PROP = "violationType";
 
   public static final int DEFAULT_MAX_OPS = 3;
   public static final String DEFAULT_METRIC = "QUERY./select.requestTimes:1minRate";
@@ -93,6 +96,8 @@ public class SearchRateTrigger extends TriggerBase {
   private String node;
   private double aboveRate;
   private double belowRate;
+  private double aboveNodeRate;
+  private double belowNodeRate;
   private CollectionParams.CollectionAction aboveOp, belowOp, aboveNodeOp, belowNodeOp;
   private final Map<String, Long> lastCollectionEvent = new ConcurrentHashMap<>();
   private final Map<String, Long> lastNodeEvent = new ConcurrentHashMap<>();
@@ -117,6 +122,8 @@ public class SearchRateTrigger extends TriggerBase {
         BELOW_NODE_OP_PROP,
         ABOVE_RATE_PROP,
         BELOW_RATE_PROP,
+        ABOVE_NODE_RATE_PROP,
+        BELOW_NODE_RATE_PROP,
         // back-compat props
         BC_COLLECTION_PROP,
         BC_RATE_PROP);
@@ -192,6 +199,28 @@ public class SearchRateTrigger extends TriggerBase {
       belowRate = -1;
     }
 
+    // node rates
+    above = properties.get(ABOVE_NODE_RATE_PROP);
+    below = properties.get(BELOW_NODE_RATE_PROP);
+    if (above != null) {
+      try {
+        aboveNodeRate = Double.parseDouble(String.valueOf(above));
+      } catch (Exception e) {
+        throw new TriggerValidationException(name, ABOVE_NODE_RATE_PROP, "Invalid configuration value: '" + above + "': " + e.toString());
+      }
+    } else {
+      aboveNodeRate = Double.MAX_VALUE;
+    }
+    if (below != null) {
+      try {
+        belowNodeRate = Double.parseDouble(String.valueOf(below));
+      } catch (Exception e) {
+        throw new TriggerValidationException(name, BELOW_NODE_RATE_PROP, "Invalid configuration value: '" + below + "': " + e.toString());
+      }
+    } else {
+      belowNodeRate = -1;
+    }
+
     String aboveOpStr = String.valueOf(properties.getOrDefault(ABOVE_OP_PROP, CollectionParams.CollectionAction.ADDREPLICA.toLower()));
     String belowOpStr = String.valueOf(properties.getOrDefault(BELOW_OP_PROP, CollectionParams.CollectionAction.DELETEREPLICA.toLower()));
     aboveOp = CollectionParams.CollectionAction.get(aboveOpStr);
@@ -231,6 +260,8 @@ public class SearchRateTrigger extends TriggerBase {
     config.put(MIN_REPLICAS_PROP, minReplicas);
     config.put(ABOVE_RATE_PROP, aboveRate);
     config.put(BELOW_RATE_PROP, belowRate);
+    config.put(ABOVE_NODE_RATE_PROP, aboveNodeRate);
+    config.put(BELOW_NODE_RATE_PROP, belowNodeRate);
     config.put(ABOVE_OP_PROP, aboveOp);
     config.put(ABOVE_NODE_OP_PROP, aboveNodeOp);
     config.put(BELOW_OP_PROP, belowOp);
@@ -358,15 +389,16 @@ public class SearchRateTrigger extends TriggerBase {
     long now = cloudManager.getTimeSource().getTimeNs();
     Map<String, Double> hotNodes = new HashMap<>();
     Map<String, Double> coldNodes = new HashMap<>();
+
     // check for exceeded rates and filter out those with less than waitFor from previous events
     nodeRates.entrySet().stream()
         .filter(entry -> node.equals(Policy.ANY) || node.equals(entry.getKey()))
         .forEach(entry -> {
-          if (entry.getValue().get() > aboveRate) {
+          if (entry.getValue().get() > aboveNodeRate) {
             if (waitForElapsed(entry.getKey(), now, lastNodeEvent)) {
               hotNodes.put(entry.getKey(), entry.getValue().get());
             }
-          } else if (entry.getValue().get() < belowRate) {
+          } else if (entry.getValue().get() < belowNodeRate) {
             if (waitForElapsed(entry.getKey(), now, lastNodeEvent)) {
               coldNodes.put(entry.getKey(), entry.getValue().get());
             }
@@ -401,6 +433,8 @@ public class SearchRateTrigger extends TriggerBase {
               return r;
             })
             .mapToDouble(r -> (Double)r.getVariable(AutoScalingParams.RATE)).sum();
+        // calculate average shard rate over all searchable replicas (see SOLR-12470)
+        shardRate = shardRate / searchableReplicationFactors.get(coll).get(sh).doubleValue();
         String elapsedKey = coll + "." + sh;
         if ((collections.isEmpty() || collections.contains(coll)) &&
             (shard.equals(Policy.ANY) || shard.equals(sh))) {
@@ -411,6 +445,13 @@ public class SearchRateTrigger extends TriggerBase {
           } else if (shardRate < belowRate) {
             if (waitForElapsed(elapsedKey, now, lastShardEvent)) {
               coldShards.computeIfAbsent(coll, s -> new HashMap<>()).put(sh, shardRate);
+            } else {
+              if (log.isDebugEnabled()) {
+                Long lastTime = lastShardEvent.computeIfAbsent(elapsedKey, s -> now);
+                long elapsed = TimeUnit.SECONDS.convert(now - lastTime, TimeUnit.NANOSECONDS);
+                log.debug("-- waitFor didn't elapse for " + elapsedKey + ", waitFor=" + getWaitForSecond() +
+                    ", elapsed=" + elapsed);
+              }
             }
           } else {
             // no violation - clear waitForElapsed
@@ -511,9 +552,10 @@ public class SearchRateTrigger extends TriggerBase {
     });
 
     final List<TriggerEvent.Op> ops = new ArrayList<>();
+    final Set<String> violations = new HashSet<>();
 
-    calculateHotOps(ops, searchableReplicationFactors, hotNodes, hotCollections, hotShards, hotReplicas);
-    calculateColdOps(ops, clusterState, searchableReplicationFactors, coldNodes, coldCollections, coldShards, coldReplicas);
+    calculateHotOps(ops, violations, searchableReplicationFactors, hotNodes, hotCollections, hotShards, hotReplicas);
+    calculateColdOps(ops, violations, clusterState, searchableReplicationFactors, coldNodes, coldCollections, coldShards, coldReplicas);
 
     if (ops.isEmpty()) {
       return;
@@ -521,7 +563,7 @@ public class SearchRateTrigger extends TriggerBase {
 
     if (processor.process(new SearchRateEvent(getName(), eventTime.get(), ops,
         hotNodes, hotCollections, hotShards, hotReplicas,
-        coldNodes, coldCollections, coldShards, coldReplicas))) {
+        coldNodes, coldCollections, coldShards, coldReplicas, violations))) {
       // update lastEvent times
       hotNodes.keySet().forEach(node -> lastNodeEvent.put(node, now));
       coldNodes.keySet().forEach(node -> lastNodeEvent.put(node, now));
@@ -537,6 +579,7 @@ public class SearchRateTrigger extends TriggerBase {
   }
 
   private void calculateHotOps(List<TriggerEvent.Op> ops,
+                               Set<String> violations,
                                Map<String, Map<String, AtomicInteger>> searchableReplicationFactors,
                                Map<String, Double> hotNodes,
                                Map<String, Double> hotCollections,
@@ -545,40 +588,47 @@ public class SearchRateTrigger extends TriggerBase {
     // calculate the number of replicas to add to each hot shard, based on how much the rate was
     // exceeded - but within limits.
 
-    // first resolve a situation when only a node is hot but no collection / shard / replica is hot
+    // first resolve a situation when only a node is hot but no collection / shard is hot
     // TODO: eventually we may want to commission a new node
-    if (!hotNodes.isEmpty() && hotShards.isEmpty() && hotCollections.isEmpty() && hotReplicas.isEmpty()) {
-      // move replicas around
-      if (aboveNodeOp != null) {
-        hotNodes.forEach((n, r) -> {
-          ops.add(new TriggerEvent.Op(aboveNodeOp, Suggester.Hint.SRC_NODE, n));
-        });
-      }
-    } else {
-      // add replicas
-      Map<String, Map<String, List<Pair<String, String>>>> hints = new HashMap<>();
-
-      hotShards.forEach((coll, shards) -> shards.forEach((s, r) -> {
-        List<Pair<String, String>> perShard = hints
-            .computeIfAbsent(coll, c -> new HashMap<>())
-            .computeIfAbsent(s, sh -> new ArrayList<>());
-        addReplicaHints(coll, s, r, searchableReplicationFactors.get(coll).get(s).get(), perShard);
-      }));
-      hotReplicas.forEach(ri -> {
-        double r = (Double)ri.getVariable(AutoScalingParams.RATE);
-        // add only if not already accounted for in hotShards
-        List<Pair<String, String>> perShard = hints
-            .computeIfAbsent(ri.getCollection(), c -> new HashMap<>())
-            .computeIfAbsent(ri.getShard(), sh -> new ArrayList<>());
-        if (perShard.isEmpty()) {
-          addReplicaHints(ri.getCollection(), ri.getShard(), r, searchableReplicationFactors.get(ri.getCollection()).get(ri.getShard()).get(), perShard);
+    if (!hotNodes.isEmpty()) {
+      if (hotShards.isEmpty() && hotCollections.isEmpty()) {
+        // move replicas around
+        if (aboveNodeOp != null) {
+          hotNodes.forEach((n, r) -> {
+            ops.add(new TriggerEvent.Op(aboveNodeOp, Suggester.Hint.SRC_NODE, n));
+            violations.add(HOT_NODES);
+          });
         }
-      });
-
-      hints.values().forEach(m -> m.values().forEach(lst -> lst.forEach(p -> {
-        ops.add(new TriggerEvent.Op(aboveOp, Suggester.Hint.COLL_SHARD, p));
-      })));
+      } else {
+        // ignore - hot shards will result in changes that will change hot node status anyway
+      }
     }
+    // add replicas
+    Map<String, Map<String, List<Pair<String, String>>>> hints = new HashMap<>();
+
+    // HOT COLLECTIONS
+    // currently we don't do anything for hot collections. Theoretically we could add
+    // 1 replica more to each shard, based on how close to the threshold each shard is
+    // but it's probably better to wait for a shard to become hot and be more precise.
+
+    // HOT SHARDS
+
+    hotShards.forEach((coll, shards) -> shards.forEach((s, r) -> {
+      List<Pair<String, String>> perShard = hints
+          .computeIfAbsent(coll, c -> new HashMap<>())
+          .computeIfAbsent(s, sh -> new ArrayList<>());
+      addReplicaHints(coll, s, r, searchableReplicationFactors.get(coll).get(s).get(), perShard);
+      violations.add(HOT_SHARDS);
+    }));
+
+    // HOT REPLICAS
+    // Hot replicas (while their shards are not hot) may be caused by
+    // dumb clients that use direct replica URLs - this is beyond our control
+    // so ignore them.
+
+    hints.values().forEach(m -> m.values().forEach(lst -> lst.forEach(p -> {
+      ops.add(new TriggerEvent.Op(aboveOp, Suggester.Hint.COLL_SHARD, p));
+    })));
 
   }
 
@@ -601,6 +651,7 @@ public class SearchRateTrigger extends TriggerBase {
   }
 
   private void calculateColdOps(List<TriggerEvent.Op> ops,
+                                Set<String> violations,
                                 ClusterState clusterState,
                                 Map<String, Map<String, AtomicInteger>> searchableReplicationFactors,
                                 Map<String, Double> coldNodes,
@@ -611,12 +662,13 @@ public class SearchRateTrigger extends TriggerBase {
     // Probably can't do anything reasonable about whole cold collections
     // because they may be needed even if not used.
 
-    // COLD SHARDS:
-    // Cold shards mean that there are too many replicas per shard - but it also
-    // means that all replicas in these shards are cold too, so we can simply
-    // address this by deleting cold replicas
+    // COLD SHARDS & COLD REPLICAS:
+    // We remove cold replicas only from cold shards, otherwise we are susceptible to uneven
+    // replica routing (which is beyond our control).
+    // If we removed replicas from non-cold shards we could accidentally bring that shard into
+    // the hot range, which would result in adding replica, and that replica could again stay cold due to
+    // the same routing issue, which then would lead to removing that replica, etc, etc...
 
-    // COLD REPLICAS:
     // Remove cold replicas but only when there's at least a minimum number of searchable
     // replicas still available (additional non-searchable replicas may exist, too)
     // NOTE: do this before adding ops for DELETENODE because we don't want to attempt
@@ -627,11 +679,18 @@ public class SearchRateTrigger extends TriggerBase {
           .computeIfAbsent(ri.getShard(), s -> new ArrayList<>())
           .add(ri);
     });
-    byCollectionByShard.forEach((coll, shards) -> {
-      shards.forEach((shard, replicas) -> {
+    coldShards.forEach((coll, perShard) -> {
+      perShard.forEach((shard, rate) -> {
+        List<ReplicaInfo> replicas = byCollectionByShard
+            .getOrDefault(coll, Collections.emptyMap())
+            .getOrDefault(shard, Collections.emptyList());
+        if (replicas.isEmpty()) {
+          return;
+        }
         // only delete if there's at least minRF searchable replicas left
         int rf = searchableReplicationFactors.get(coll).get(shard).get();
-        // we only really need a leader and we may be allowed to remove other replicas
+        // assume first that we only really need a leader and we may be
+        // allowed to remove other replicas
         int minRF = 1;
         // but check the official RF and don't go below that
         Integer RF = clusterState.getCollection(coll).getReplicationFactor();
@@ -660,6 +719,7 @@ public class SearchRateTrigger extends TriggerBase {
                 Suggester.Hint.COLL_SHARD, new Pair<>(ri.getCollection(), ri.getShard()));
             op.addHint(Suggester.Hint.REPLICA, ri.getName());
             ops.add(op);
+            violations.add(COLD_SHARDS);
             limit.decrementAndGet();
           });
         }
@@ -669,7 +729,7 @@ public class SearchRateTrigger extends TriggerBase {
     // COLD NODES:
     // Unlike the case of hot nodes, if a node is cold then any monitored
     // collections / shards / replicas located on that node are cold, too.
-    // HOWEVER, we check only non-pull replicas and only from selected collections / shards,
+    // HOWEVER, we check only replicas from selected collections / shards,
     // so deleting a cold node is dangerous because it may interfere with these
     // non-monitored resources - this is the reason the default belowNodeOp is null / ignored.
     //
@@ -679,6 +739,7 @@ public class SearchRateTrigger extends TriggerBase {
     if (belowNodeOp != null) {
       coldNodes.forEach((node, rate) -> {
         ops.add(new TriggerEvent.Op(belowNodeOp, Suggester.Hint.SRC_NODE, node));
+        violations.add(COLD_NODES);
       });
     }
 
@@ -688,7 +749,7 @@ public class SearchRateTrigger extends TriggerBase {
   private boolean waitForElapsed(String name, long now, Map<String, Long> lastEventMap) {
     Long lastTime = lastEventMap.computeIfAbsent(name, s -> now);
     long elapsed = TimeUnit.SECONDS.convert(now - lastTime, TimeUnit.NANOSECONDS);
-    log.trace("name={}, lastTime={}, elapsed={}", name, lastTime, elapsed);
+    log.trace("name={}, lastTime={}, elapsed={}, waitFor={}", name, lastTime, elapsed, getWaitForSecond());
     if (TimeUnit.SECONDS.convert(now - lastTime, TimeUnit.NANOSECONDS) < getWaitForSecond()) {
       return false;
     }
@@ -704,7 +765,8 @@ public class SearchRateTrigger extends TriggerBase {
                            Map<String, Double> coldNodes,
                            Map<String, Double> coldCollections,
                            Map<String, Map<String, Double>> coldShards,
-                           List<ReplicaInfo> coldReplicas) {
+                           List<ReplicaInfo> coldReplicas,
+                           Set<String> violations) {
       super(TriggerEventType.SEARCHRATE, source, eventTime, null);
       properties.put(TriggerEvent.REQUESTED_OPS, ops);
       properties.put(HOT_NODES, hotNodes);
@@ -715,6 +777,7 @@ public class SearchRateTrigger extends TriggerBase {
       properties.put(COLD_COLLECTIONS, coldCollections);
       properties.put(COLD_SHARDS, coldShards);
       properties.put(COLD_REPLICAS, coldReplicas);
+      properties.put(VIOLATION_PROP, violations);
     }
   }
 }
\ No newline at end of file
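
The new aboveNodeRate / belowNodeRate properties let node-level thresholds be configured independently of the per-shard aboveRate / belowRate; when omitted they default to Double.MAX_VALUE and -1, so node-level violations are effectively disabled unless set explicitly. A minimal sketch of a set-trigger command using the new properties, modelled on the integration-test configs later in this commit (the trigger name and rate values are illustrative only):

    String setTriggerCommand = "{" +
        "'set-trigger' : {" +
        "'name' : 'search_rate_trigger'," +      // illustrative name
        "'event' : 'searchRate'," +
        "'waitFor' : '5s'," +
        "'aboveRate' : 1.0," +                   // per-shard thresholds
        "'belowRate' : 0.1," +
        "'aboveNodeRate' : 1.0," +               // new per-node thresholds
        "'belowNodeRate' : 0.1," +
        "'enabled' : true" +
        "}" +
        "}";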

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d8fb6916/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerActionException.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerActionException.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerActionException.java
new file mode 100644
index 0000000..624ce68
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerActionException.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.cloud.autoscaling;
+
+/**
+ * Trigger action-specific exception.
+ */
+public class TriggerActionException extends Exception {
+
+  public final String triggerName;
+  public final String actionName;
+
+  public TriggerActionException(String triggerName, String actionName, String message, Throwable cause) {
+    super(message, cause);
+    this.triggerName = triggerName;
+    this.actionName = actionName;
+  }
+}
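
With this change, ScheduledTriggers (above) wraps any failure of an individual action in this exception after firing the FAILED stage for that action, and the outer catch logs it without firing FAILED a second time; only genuinely unexpected exceptions take the new second catch branch, which fires FAILED itself. A minimal sketch of an action whose failure would be wrapped this way, assuming the TriggerActionBase / ActionContext API used by the tests in this commit (the class name is illustrative):

    import org.apache.solr.cloud.autoscaling.ActionContext;
    import org.apache.solr.cloud.autoscaling.TriggerActionBase;
    import org.apache.solr.cloud.autoscaling.TriggerEvent;

    // Illustrative action: any exception thrown from process() is reported to
    // listeners as FAILED and rethrown as
    // TriggerActionException(event.getSource(), action.getName(), ...).
    public class FlakyAction extends TriggerActionBase {
      @Override
      public void process(TriggerEvent event, ActionContext context) throws Exception {
        throw new Exception("simulated action failure");
      }
    }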

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d8fb6916/solr/core/src/test/org/apache/solr/cloud/CloudTestUtils.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/CloudTestUtils.java b/solr/core/src/test/org/apache/solr/cloud/CloudTestUtils.java
index 448889f..71e6ef2 100644
--- a/solr/core/src/test/org/apache/solr/cloud/CloudTestUtils.java
+++ b/solr/core/src/test/org/apache/solr/cloud/CloudTestUtils.java
@@ -20,6 +20,7 @@ package org.apache.solr.cloud;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
@@ -118,7 +119,7 @@ public class CloudTestUtils {
    * @param expectedReplicas expected number of active replicas
    */
   public static CollectionStatePredicate clusterShape(int expectedShards, int expectedReplicas) {
-    return clusterShape(expectedShards, expectedReplicas, false);
+    return clusterShape(expectedShards, expectedReplicas, false, false);
   }
 
   /**
@@ -129,8 +130,10 @@ public class CloudTestUtils {
    * @param expectedShards expected number of shards
    * @param expectedReplicas expected number of active replicas
    * @param withInactive if true then count also inactive shards
+   * @param requireLeaders if true then require that each shard has a leader
    */
-  public static CollectionStatePredicate clusterShape(int expectedShards, int expectedReplicas, boolean withInactive) {
+  public static CollectionStatePredicate clusterShape(int expectedShards, int expectedReplicas, boolean withInactive,
+                                                      boolean requireLeaders) {
     return (liveNodes, collectionState) -> {
       if (collectionState == null) {
         log.debug("-- null collection");
@@ -141,8 +144,17 @@ public class CloudTestUtils {
         log.debug("-- wrong number of active slices, expected=" + expectedShards + ", found=" + collectionState.getSlices().size());
         return false;
       }
+      Set<String> leaderless = new HashSet<>();
       for (Slice slice : slices) {
         int activeReplicas = 0;
+        if (requireLeaders && slice.getLeader() == null) {
+          leaderless.add(slice.getName());
+          continue;
+        }
+        // skip other checks, we're going to fail anyway
+        if (!leaderless.isEmpty()) {
+          continue;
+        }
         for (Replica replica : slice) {
           if (replica.isActive(liveNodes))
             activeReplicas++;
@@ -152,7 +164,12 @@ public class CloudTestUtils {
           return false;
         }
       }
-      return true;
+      if (leaderless.isEmpty()) {
+        return true;
+      } else {
+        log.debug("-- shards without leaders: " + leaderless);
+        return false;
+      }
     };
   }
 }
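
The predicate now takes a fourth argument, clusterShape(expectedShards, expectedReplicas, withInactive, requireLeaders), where requireLeaders additionally fails the check while any slice has no elected leader. The updated call sites in this commit use it as follows (taken from the test changes below):

    // wait for 1 shard with 2 active replicas, counting only active shards,
    // and require that every shard has an elected leader
    CloudTestUtils.waitForState(cluster, "Timed out waiting for replicas of new collection to be active",
        "testNodeLost", CloudTestUtils.clusterShape(1, 2, false, true));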

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d8fb6916/solr/core/src/test/org/apache/solr/cloud/autoscaling/IndexSizeTriggerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/IndexSizeTriggerTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/IndexSizeTriggerTest.java
index daf9867..4c0a17c 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/IndexSizeTriggerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/IndexSizeTriggerTest.java
@@ -310,7 +310,7 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
 
     boolean await = finished.await(60000 / SPEED, TimeUnit.MILLISECONDS);
     assertTrue("did not finish processing in time", await);
-    CloudTestUtils.waitForState(cloudManager, collectionName, 20, TimeUnit.SECONDS, CloudTestUtils.clusterShape(6, 2, true));
+    CloudTestUtils.waitForState(cloudManager, collectionName, 20, TimeUnit.SECONDS, CloudTestUtils.clusterShape(6, 2, true, true));
     assertEquals(1, listenerEvents.size());
     List<CapturedEvent> events = listenerEvents.get("capturing");
     assertNotNull("'capturing' events not found", events);
@@ -583,7 +583,7 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
 
     // collection should have 2 inactive and 4 active shards
     CloudTestUtils.waitForState(cloudManager, "failed to create " + collectionName, collectionName,
-        CloudTestUtils.clusterShape(6, 2, true));
+        CloudTestUtils.clusterShape(6, 2, true, true));
 
     // check ops
     List<TriggerEvent.Op> ops = (List<TriggerEvent.Op>) events.get(4).event.getProperty(TriggerEvent.REQUESTED_OPS);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d8fb6916/solr/core/src/test/org/apache/solr/cloud/autoscaling/ScheduledMaintenanceTriggerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/ScheduledMaintenanceTriggerTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/ScheduledMaintenanceTriggerTest.java
index 164db8f..56896a7 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/ScheduledMaintenanceTriggerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/ScheduledMaintenanceTriggerTest.java
@@ -165,7 +165,7 @@ public class ScheduledMaintenanceTriggerTest extends SolrCloudTestCase {
         .setShardName("shard1");
     split1.process(solrClient);
     CloudTestUtils.waitForState(cloudManager, "failed to split " + collection1, collection1,
-        CloudTestUtils.clusterShape(3, 1, true));
+        CloudTestUtils.clusterShape(3, 1, true, true));
 
     String setListenerCommand = "{" +
         "'set-listener' : " +

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d8fb6916/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java
index f8492d9..476d78a 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java
@@ -235,9 +235,8 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
     assertTrue(TimeUnit.SECONDS.convert(waitForSeconds, TimeUnit.NANOSECONDS) - WAIT_FOR_DELTA_NANOS <= now - ev.event.getEventTime());
     Map<String, Double> nodeRates = (Map<String, Double>) ev.event.getProperties().get(SearchRateTrigger.HOT_NODES);
     assertNotNull("nodeRates", nodeRates);
-    assertTrue(nodeRates.toString(), nodeRates.size() > 0);
-    AtomicDouble totalNodeRate = new AtomicDouble();
-    nodeRates.forEach((n, r) -> totalNodeRate.addAndGet(r));
+    // no node violations because node rates weren't set in the config
+    assertTrue(nodeRates.toString(), nodeRates.isEmpty());
     List<ReplicaInfo> replicaRates = (List<ReplicaInfo>) ev.event.getProperties().get(SearchRateTrigger.HOT_REPLICAS);
     assertNotNull("replicaRates", replicaRates);
     assertTrue(replicaRates.toString(), replicaRates.size() > 0);
@@ -260,8 +259,8 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
     Double collectionRate = collectionRates.get(COLL1);
     assertNotNull(collectionRate);
     assertTrue(collectionRate > 5.0);
-    assertEquals(collectionRate, totalNodeRate.get(), 5.0);
-    assertEquals(collectionRate, totalShardRate.get(), 5.0);
+    // two replicas - the trigger calculates average over all searchable replicas
+    assertEquals(collectionRate / 2, totalShardRate.get(), 5.0);
     assertEquals(collectionRate, totalReplicaRate.get(), 5.0);
 
     // check operations
@@ -303,6 +302,8 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
         "'collections' : '" + COLL1 + "'," +
         "'aboveRate' : 1.0," +
         "'belowRate' : 0.1," +
+        "'aboveNodeRate' : 1.0," +
+        "'belowNodeRate' : 0.1," +
         // do nothing but generate an op
         "'belowNodeOp' : 'none'," +
         "'actions' : [" +
@@ -383,7 +384,7 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
     response = solrClient.request(req);
     assertEquals(response.get("result").toString(), "success");
 
-    timeSource.sleep(5000);
+    timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS));
 
     List<CapturedEvent> events = listenerEvents.get("srt");
     assertEquals(events.toString(), 3, events.size());
@@ -445,10 +446,15 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
     assertEquals(ops.toString(), 1, ops.size());
     assertEquals(ops.toString(), CollectionParams.CollectionAction.NONE, ops.get(0).getAction());
 
+    // wait for waitFor to elapse for all types of violations
+    timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds * 2, TimeUnit.SECONDS));
+
     listenerEvents.clear();
     finished = new CountDownLatch(1);
     started = new CountDownLatch(1);
 
+    log.info("## test single replicas.");
+
     // now allow single replicas
     setTriggerCommand = "{" +
         "'set-trigger' : {" +
@@ -459,6 +465,8 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
         "'collections' : '" + COLL1 + "'," +
         "'aboveRate' : 1.0," +
         "'belowRate' : 0.1," +
+        "'aboveNodeRate' : 1.0," +
+        "'belowNodeRate' : 0.1," +
         "'minReplicas' : 1," +
         "'belowNodeOp' : 'none'," +
         "'actions' : [" +
@@ -543,6 +551,9 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
         "'collections' : '" + COLL1 + "'," +
         "'aboveRate' : 1.0," +
         "'belowRate' : 0.1," +
+        // set limits to node rates
+        "'aboveNodeRate' : 1.0," +
+        "'belowNodeRate' : 0.1," +
         // allow deleting all spare replicas
         "'minReplicas' : 1," +
         // allow requesting all deletions in one event
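
The collectionRate / 2 assertion earlier in this test reflects the change in SearchRateTrigger: the per-shard rate is now the sum of the replica rates divided by the number of searchable replicas in that shard. For example (illustrative numbers), a shard whose two searchable replicas serve 6.0 and 4.0 requests per second now reports

    shardRate = (6.0 + 4.0) / 2 = 5.0

rather than the previous sum of 10.0, and it is this averaged value that is compared against aboveRate and belowRate.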

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d8fb6916/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerTest.java
index 60b973f..0e9f4fb 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerTest.java
@@ -174,11 +174,11 @@ public class SearchRateTriggerTest extends SolrCloudTestCase {
       }
       Thread.sleep(waitForSeconds * 1000);
       trigger.run();
-      // should generate node and collection event but not for COLL2 because of waitFor
+      // should generate collection event but not for COLL2 because of waitFor
       assertEquals(1, events.size());
       event = events.get(0);
       Map<String, Double> hotNodes = (Map<String, Double>)event.getProperty(SearchRateTrigger.HOT_NODES);
-      assertEquals(3, hotNodes.size());
+      assertTrue("hotNodes", hotNodes.isEmpty());
       hotNodes.forEach((n, r) -> assertTrue(n, r > rate));
       hotCollections = (Map<String, Double>)event.getProperty(SearchRateTrigger.HOT_COLLECTIONS);
       assertEquals(1, hotCollections.size());
@@ -193,7 +193,7 @@ public class SearchRateTriggerTest extends SolrCloudTestCase {
 
       Thread.sleep(waitForSeconds * 1000 * 2);
       trigger.run();
-      // should generate node and collection event
+      // should generate collection event
       assertEquals(1, events.size());
       event = events.get(0);
       hotCollections = (Map<String, Double>)event.getProperty(SearchRateTrigger.HOT_COLLECTIONS);
@@ -203,8 +203,7 @@ public class SearchRateTriggerTest extends SolrCloudTestCase {
       Rate = hotCollections.get(COLL2);
       assertNotNull(Rate);
       hotNodes = (Map<String, Double>)event.getProperty(SearchRateTrigger.HOT_NODES);
-      assertEquals(3, hotNodes.size());
-      hotNodes.forEach((n, r) -> assertTrue(n, r > rate));
+      assertTrue("hotNodes", hotNodes.isEmpty());
     }
   }
 
@@ -266,7 +265,7 @@ public class SearchRateTriggerTest extends SolrCloudTestCase {
       hotCollections = (Map<String, Object>)event.properties.get(SearchRateTrigger.HOT_COLLECTIONS);
       hotShards = (Map<String, Object>)event.properties.get(SearchRateTrigger.HOT_SHARDS);
       hotReplicas = (List<ReplicaInfo>)event.properties.get(SearchRateTrigger.HOT_REPLICAS);
-      assertFalse("no hot nodes?", hotNodes.isEmpty());
+      assertTrue("no hot nodes?", hotNodes.isEmpty());
       assertFalse("no hot collections?", hotCollections.isEmpty());
       assertFalse("no hot shards?", hotShards.isEmpty());
       assertFalse("no hot replicas?", hotReplicas.isEmpty());

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d8fb6916/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
index abc3ccf..534ae71 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
@@ -599,12 +599,16 @@ public class SimClusterStateProvider implements ClusterStateProvider {
       if (!collections.contains(dc.getName())) {
         return;
       }
-      dc.getSlices().forEach(s ->
+      dc.getSlices().forEach(s -> {
+        if (s.getLeader() != null) {
+          return;
+        }
+        LOG.debug("-- submit leader election for {} / {}", dc.getName(), s.getName());
         cloudManager.submit(() -> {
           simRunLeaderElection(dc.getName(), s, saveClusterState);
           return true;
-        })
-      );
+        });
+      });
     });
   }
 
@@ -612,59 +616,71 @@ public class SimClusterStateProvider implements ClusterStateProvider {
     AtomicBoolean stateChanged = new AtomicBoolean(Boolean.FALSE);
     Replica leader = s.getLeader();
     if (leader == null || !liveNodes.contains(leader.getNodeName())) {
-      LOG.trace("Running leader election for " + collection + " / " + s.getName());
+      LOG.debug("Running leader election for {} / {}", collection, s.getName());
       if (s.getReplicas().isEmpty()) { // no replicas - punt
+        LOG.debug("-- no replicas in {} / {}", collection, s.getName());
         return;
       }
       ActionThrottle lt = getThrottle(collection, s.getName());
-      lt.minimumWaitBetweenActions();
-      lt.markAttemptingAction();
-
-      // mark all replicas as non-leader (probably not necessary) and collect all active and live
-      List<ReplicaInfo> active = new ArrayList<>();
-      s.getReplicas().forEach(r -> {
-        // find our ReplicaInfo for this replica
-        ReplicaInfo ri = getReplicaInfo(r);
-        if (ri == null) {
-          throw new IllegalStateException("-- could not find ReplicaInfo for replica " + r);
-        }
-        synchronized (ri) {
-          if (ri.getVariables().remove(ZkStateReader.LEADER_PROP) != null) {
-            stateChanged.set(true);
+      synchronized (lt) {
+        // collect all active and live
+        List<ReplicaInfo> active = new ArrayList<>();
+        AtomicBoolean alreadyHasLeader = new AtomicBoolean(false);
+        s.getReplicas().forEach(r -> {
+          // find our ReplicaInfo for this replica
+          ReplicaInfo ri = getReplicaInfo(r);
+          if (ri == null) {
+            throw new IllegalStateException("-- could not find ReplicaInfo for replica " + r);
           }
-          if (r.isActive(liveNodes.get())) {
-            active.add(ri);
-          } else { // if it's on a node that is not live mark it down
-            if (!liveNodes.contains(r.getNodeName())) {
-              ri.getVariables().put(ZkStateReader.STATE_PROP, Replica.State.DOWN.toString());
-              stateChanged.set(true);
+          synchronized (ri) {
+            if (ri.getVariables().get(ZkStateReader.LEADER_PROP) != null) {
+              if (r.isActive(liveNodes.get())) {
+                alreadyHasLeader.set(true);
+                return;
+              }
+            }
+            if (r.isActive(liveNodes.get())) {
+              active.add(ri);
+            } else { // if it's on a node that is not live mark it down
+              if (!liveNodes.contains(r.getNodeName())) {
+                ri.getVariables().put(ZkStateReader.STATE_PROP, Replica.State.DOWN.toString());
+                ri.getVariables().remove(ZkStateReader.LEADER_PROP);
+                stateChanged.set(true);
+              }
             }
           }
+        });
+        if (active.isEmpty()) {
+          LOG.warn("-- can't find any active replicas for " + collection + " / " + s.getName());
+          return;
         }
-      });
-      if (active.isEmpty()) {
-        LOG.warn("-- can't find any active replicas for " + collection + " / " + s.getName());
-        return;
-      }
-      // pick first active one
-      ReplicaInfo ri = null;
-      for (ReplicaInfo a : active) {
-        if (!a.getType().equals(Replica.Type.PULL)) {
-          ri = a;
-          break;
+        if (alreadyHasLeader.get()) {
+          LOG.debug("-- already has leader {} / {}", collection, s.getName());
+          return;
         }
+        // pick first active one
+        ReplicaInfo ri = null;
+        for (ReplicaInfo a : active) {
+          if (!a.getType().equals(Replica.Type.PULL)) {
+            ri = a;
+            break;
+          }
+        }
+        if (ri == null) {
+          LOG.warn("-- can't find any suitable replica type for " + collection + " / " + s.getName());
+          return;
+        }
+        // now mark the leader election throttle
+        lt.minimumWaitBetweenActions();
+        lt.markAttemptingAction();
+        synchronized (ri) {
+          ri.getVariables().put(ZkStateReader.LEADER_PROP, "true");
+        }
+        stateChanged.set(true);
+        LOG.debug("-- elected new leader for " + collection + " / " + s.getName() + ": " + ri);
       }
-      if (ri == null) {
-        LOG.warn("-- can't find any suitable replica type for " + collection + " / " + s.getName());
-        return;
-      }
-      synchronized (ri) {
-        ri.getVariables().put(ZkStateReader.LEADER_PROP, "true");
-      }
-      stateChanged.set(true);
-      LOG.debug("-- elected new leader for " + collection + " / " + s.getName() + ": " + ri);
     } else {
-      LOG.trace("-- already has leader for {} / {}", collection, s.getName());
+      LOG.debug("-- already has leader for {} / {}", collection, s.getName());
     }
     if (stateChanged.get() || saveState) {
       collectionsStatesRef.set(null);
@@ -781,7 +797,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
         ReplicaInfo ri = new ReplicaInfo("core_node" + Assign.incAndGetId(stateManager, collectionName, 0),
             coreName, collectionName, pos.shard, pos.type, pos.node, replicaProps);
         cloudManager.submit(() -> {
-          simAddReplica(pos.node, ri, false);
+          simAddReplica(pos.node, ri, true);
           finalStateLatch.countDown();
           return true;
         });
@@ -814,7 +830,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
 
     // force recreation of collection states
     collectionsStatesRef.set(null);
-    simRunLeaderElection(Collections.singleton(collectionName), true);
+    //simRunLeaderElection(Collections.singleton(collectionName), true);
     if (waitForFinalState) {
       boolean finished = finalStateLatch.await(cloudManager.getTimeSource().convertDelay(TimeUnit.SECONDS, 60, TimeUnit.MILLISECONDS),
           TimeUnit.MILLISECONDS);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d8fb6916/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java
index cb8640c..ee87044 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java
@@ -30,6 +30,8 @@ import java.util.TreeSet;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.locks.ReentrantLock;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 import org.apache.solr.client.solrj.cloud.NodeStateProvider;
@@ -230,6 +232,7 @@ public class SimNodeStateProvider implements NodeStateProvider {
     }
   }
 
+  private static final Pattern REGISTRY_PATTERN = Pattern.compile("^solr\\.core\\.([\\w.-]+?)\\.(shard[\\d]+?)\\.(replica.*)");
   /**
    * Simulate getting replica metrics values. This uses per-replica properties set in
    * {@link SimClusterStateProvider#simSetCollectionValue(String, String, Object, boolean, boolean)} and
@@ -257,14 +260,15 @@ public class SimNodeStateProvider implements NodeStateProvider {
         // skip - this is probably solr.node or solr.jvm metric
         continue;
       }
-      String[] collParts = parts[1].substring(10).split("\\.");
-      if (collParts.length != 3) {
+      Matcher m = REGISTRY_PATTERN.matcher(parts[1]);
+
+      if (!m.matches()) {
         LOG.warn("Invalid registry name: " + parts[1]);
         continue;
       }
-      String collection = collParts[0];
-      String shard = collParts[1];
-      String replica = collParts[2];
+      String collection = m.group(1);
+      String shard = m.group(2);
+      String replica = m.group(3);
       String key = parts.length > 3 ? parts[2] + ":" + parts[3] : parts[2];
       replicas.forEach(r -> {
         if (r.getCollection().equals(collection) && r.getShard().equals(shard) && r.getCore().endsWith(replica)) {
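
The old code split the registry name after the "solr.core." prefix on "." and required exactly three parts, which mis-parses collection names that themselves contain dots; the reluctant pattern above captures collection, shard and replica in one match. A small sketch using the same pattern (the collection name is illustrative):

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class RegistryNameParseSketch {
      public static void main(String[] args) {
        Pattern p = Pattern.compile("^solr\\.core\\.([\\w.-]+?)\\.(shard[\\d]+?)\\.(replica.*)");
        Matcher m = p.matcher("solr.core.my.dotted.collection.shard1.replica_n2");
        if (m.matches()) {
          System.out.println(m.group(1)); // my.dotted.collection
          System.out.println(m.group(2)); // shard1
          System.out.println(m.group(3)); // replica_n2
        }
      }
    }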

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d8fb6916/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestComputePlanAction.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestComputePlanAction.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestComputePlanAction.java
index 9edf0ed..c1f3a56 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestComputePlanAction.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestComputePlanAction.java
@@ -147,7 +147,7 @@ public class TestComputePlanAction extends SimSolrCloudTestCase {
     create.process(solrClient);
 
     CloudTestUtils.waitForState(cluster, "Timed out waiting for replicas of new collection to be active",
-        "testNodeLost", CloudTestUtils.clusterShape(1, 2));
+        "testNodeLost", CloudTestUtils.clusterShape(1, 2, false, true));
 
     ClusterState clusterState = cluster.getClusterStateProvider().getClusterState();
     log.debug("-- cluster state: {}", clusterState);
@@ -210,7 +210,7 @@ public class TestComputePlanAction extends SimSolrCloudTestCase {
     create.process(solrClient);
 
     CloudTestUtils.waitForState(cluster, "Timed out waiting for replicas of new collection to be active",
-        "testNodeWithMultipleReplicasLost", CloudTestUtils.clusterShape(2, 3));
+        "testNodeWithMultipleReplicasLost", CloudTestUtils.clusterShape(2, 3, false, true));
 
     ClusterState clusterState = cluster.getClusterStateProvider().getClusterState();
     log.debug("-- cluster state: {}", clusterState);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d8fb6916/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestExecutePlanAction.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestExecutePlanAction.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestExecutePlanAction.java
index 36d72b8..796dc3b 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestExecutePlanAction.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestExecutePlanAction.java
@@ -90,7 +90,7 @@ public class TestExecutePlanAction extends SimSolrCloudTestCase {
     create.process(solrClient);
 
     log.info("Collection ready after " + CloudTestUtils.waitForState(cluster, collectionName, 120, TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(1, 2)) + "ms");
+        CloudTestUtils.clusterShape(1, 2, false, true)) + "ms");
 
     String sourceNodeName = cluster.getSimClusterStateProvider().simGetRandomNode(random());
     ClusterState clusterState = cluster.getClusterStateProvider().getClusterState();
@@ -152,7 +152,7 @@ public class TestExecutePlanAction extends SimSolrCloudTestCase {
     }
 
     log.info("Collection ready after " + CloudTestUtils.waitForState(cluster, collectionName, 300, TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(1, 2)) + "ms");
+        CloudTestUtils.clusterShape(1, 2, false, true)) + "ms");
   }
 
   @Test
@@ -179,7 +179,7 @@ public class TestExecutePlanAction extends SimSolrCloudTestCase {
     create.process(solrClient);
 
     CloudTestUtils.waitForState(cluster, "Timed out waiting for replicas of new collection to be active",
-        collectionName, CloudTestUtils.clusterShape(1, 2));
+        collectionName, CloudTestUtils.clusterShape(1, 2, false, true));
 
     String sourceNodeName = cluster.getSimClusterStateProvider().simGetRandomNode(random());
     ClusterState clusterState = cluster.getClusterStateProvider().getClusterState();
@@ -195,8 +195,10 @@ public class TestExecutePlanAction extends SimSolrCloudTestCase {
 
     cluster.simRemoveNode(sourceNodeName, false);
 
+    cluster.getTimeSource().sleep(3000);
+
     CloudTestUtils.waitForState(cluster, "Timed out waiting for replicas of collection to be 2 again",
-        collectionName, CloudTestUtils.clusterShape(1, 2));
+        collectionName, CloudTestUtils.clusterShape(1, 2, false, true));
 
     clusterState = cluster.getClusterStateProvider().getClusterState();
     docCollection = clusterState.getCollection(collectionName);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d8fb6916/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java
index 61fedb8..c8ec6fa 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java
@@ -87,6 +87,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
   static Map<String, List<CapturedEvent>> listenerEvents = new ConcurrentHashMap<>();
   static AtomicInteger triggerFinishedCount = new AtomicInteger();
   static AtomicInteger triggerStartedCount = new AtomicInteger();
+  static CountDownLatch triggerStartedLatch;
   static CountDownLatch triggerFinishedLatch;
   static int waitForSeconds;
 
@@ -100,6 +101,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     waitForSeconds = 5;
     triggerStartedCount.set(0);
     triggerFinishedCount.set(0);
+    triggerStartedLatch = new CountDownLatch(1);
     triggerFinishedLatch = new CountDownLatch(1);
     listenerEvents.clear();
     // disable .scheduled_maintenance and .auto_add_replicas
@@ -150,6 +152,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
   public static class StartTriggerAction extends TriggerActionBase {
     @Override
     public void process(TriggerEvent event, ActionContext context) throws Exception {
+      triggerStartedLatch.countDown();
       triggerStartedCount.incrementAndGet();
     }
   }
@@ -212,7 +215,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     create.process(solrClient);
 
     log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 30 * nodes.size(), TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(5, 15)) + "ms");
+        CloudTestUtils.clusterShape(5, 15, false, true)) + "ms");
 
     int KILL_NODES = 8;
     // kill off a number of nodes
@@ -221,7 +224,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     }
     // should fully recover
     log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 90 * KILL_NODES, TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(5, 15)) + "ms");
+        CloudTestUtils.clusterShape(5, 15, false, true)) + "ms");
 
     log.info("OP COUNTS: " + cluster.simGetOpCounts());
     long moveReplicaOps = cluster.simGetOpCount(CollectionParams.CollectionAction.MOVEREPLICA.name());
@@ -256,7 +259,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
 
 
     log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 30 * nodes.size(), TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(5, 15)) + "ms");
+        CloudTestUtils.clusterShape(5, 15, false, true)) + "ms");
     long newMoveReplicaOps = cluster.simGetOpCount(CollectionParams.CollectionAction.MOVEREPLICA.name());
     log.info("==== Flaky replicas: {}. Additional MOVEREPLICA count: {}", flakyReplicas, (newMoveReplicaOps - moveReplicaOps));
     // flaky nodes lead to a number of MOVEREPLICA that is non-zero but lower than the number of flaky replicas
@@ -295,7 +298,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     create.process(solrClient);
 
     log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3)) + " ms");
+        CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3, false, true)) + " ms");
 
     // start adding nodes
     int numAddNode = NUM_NODES / 5;
@@ -305,7 +308,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
       cluster.getTimeSource().sleep(5000);
     }
     // wait until at least one event is generated
-    boolean await = triggerFinishedLatch.await(10000 / SPEED, TimeUnit.MILLISECONDS);
+    boolean await = triggerStartedLatch.await(20000 / SPEED, TimeUnit.MILLISECONDS);
     assertTrue("trigger did not fire", await);
 
     // wait until started == finished
@@ -340,7 +343,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     assertTrue("no MOVEREPLICA ops?", cluster.simGetOpCount("MOVEREPLICA") > 0);
 
     log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3)) + " ms");
+        CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3, false, true)) + " ms");
 
     int count = 50;
     SolrInputDocument finishedEvent = null;
@@ -471,6 +474,20 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     NamedList<Object> response = solrClient.request(req);
     assertEquals(response.get("result").toString(), "success");
 
+    String setListenerCommand = "{" +
+        "'set-listener' : " +
+        "{" +
+        "'name' : 'failures'," +
+        "'trigger' : 'node_lost_trigger3'," +
+        "'stage' : ['FAILED']," +
+        "'class' : '" + TestTriggerListener.class.getName() + "'" +
+        "}" +
+        "}";
+    req = createAutoScalingRequest(SolrRequest.METHOD.POST, setListenerCommand);
+    response = solrClient.request(req);
+    assertEquals(response.get("result").toString(), "success");
+
+
     // create a collection with 1 replica per node
     String collectionName = "testNodeLost";
     CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collectionName,
@@ -480,7 +497,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     create.process(solrClient);
 
     log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(NUM_NODES / 5, NUM_NODES / 10)) + " ms");
+        CloudTestUtils.clusterShape(NUM_NODES / 5, NUM_NODES / 10, false, true)) + " ms");
 
     // start killing nodes
     int numNodes = NUM_NODES / 5;
@@ -491,7 +508,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
       cluster.simRemoveNode(nodes.get(i), false);
       cluster.getTimeSource().sleep(killDelay);
     }
-    // wait for the trigger to fire at least once
+    // wait for the trigger to fire and complete at least once
     boolean await = triggerFinishedLatch.await(20 * waitFor * 1000 / SPEED, TimeUnit.MILLISECONDS);
     assertTrue("trigger did not fire within timeout, " +
         "waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored,
@@ -513,16 +530,23 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
             "waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored,
           startedEventPos > -1);
     SolrInputDocument startedEvent = systemColl.get(startedEventPos);
-    // wait until started == finished
+    // we can expect some failures when the target node of a MOVEREPLICA operation has been killed
+    // between the start of event processing and the actual execution of MOVEREPLICA
+    // wait until started == (finished + failed)
     TimeOut timeOut = new TimeOut(20 * waitFor * NUM_NODES, TimeUnit.SECONDS, cluster.getTimeSource());
     while (!timeOut.hasTimedOut()) {
       if (triggerStartedCount.get() == triggerFinishedCount.get()) {
         break;
       }
+      log.debug("started={}, finished={}, failed={}",
+          triggerStartedCount.get(), triggerFinishedCount.get(), listenerEvents.size());
       timeOut.sleep(1000);
     }
     if (timeOut.hasTimedOut()) {
-      fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get());
+      if (triggerStartedCount.get() > triggerFinishedCount.get() + listenerEvents.size()) {
+        fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get() +
+            ", failed=" + listenerEvents.size());
+      }
     }
     int ignored = 0;
     int lastIgnoredPos = startedEventPos;
@@ -547,7 +571,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
             cluster.simGetOpCount("MOVEREPLICA") > 0);
 
     log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(NUM_NODES / 5, NUM_NODES / 10)) + " ms");
+        CloudTestUtils.clusterShape(NUM_NODES / 5, NUM_NODES / 10, false, true)) + " ms");
 
     int count = 50;
     SolrInputDocument finishedEvent = null;
@@ -596,7 +620,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     create.process(solrClient);
 
     log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 300, TimeUnit.SECONDS,
-        CloudTestUtils.clusterShape(2, 10)) + " ms");
+        CloudTestUtils.clusterShape(2, 10, false, true)) + " ms");
 
     // collect the node names for shard1
     Set<String> nodes = new HashSet<>();
@@ -617,6 +641,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
         "'event' : 'searchRate'," +
         "'waitFor' : '" + waitForSeconds + "s'," +
         "'aboveRate' : 1.0," +
+        "'aboveNodeRate' : 1.0," +
         "'enabled' : true," +
         "'actions' : [" +
         "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," +
@@ -655,7 +680,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
     m.forEach((k, v) -> assertEquals(4.0, v.doubleValue(), 0.01));
     List<TriggerEvent.Op> ops = (List<TriggerEvent.Op>)ev.event.getProperty(TriggerEvent.REQUESTED_OPS);
     assertNotNull(ops);
-    assertEquals(3, ops.size());
+    assertEquals(ops.toString(), 1, ops.size());
     ops.forEach(op -> {
       assertEquals(CollectionParams.CollectionAction.ADDREPLICA, op.getAction());
       assertEquals(1, op.getHints().size());

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d8fb6916/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestPolicyCloud.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestPolicyCloud.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestPolicyCloud.java
index 1adbabb..3237639 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestPolicyCloud.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestPolicyCloud.java
@@ -69,7 +69,7 @@ public class TestPolicyCloud extends SimSolrCloudTestCase {
         .process(solrClient);
 
     CloudTestUtils.waitForState(cluster, "Timeout waiting for collection to become active", "perReplicaDataColl",
-        CloudTestUtils.clusterShape(1, 5));
+        CloudTestUtils.clusterShape(1, 5, false, true));
     DocCollection coll = getCollectionState("perReplicaDataColl");
     String autoScaleJson = "{" +
         "  'cluster-preferences': [" +
@@ -119,7 +119,7 @@ public class TestPolicyCloud extends SimSolrCloudTestCase {
         .setPolicy("c1")
         .process(solrClient);
     CloudTestUtils.waitForState(cluster, "Timeout waiting for collection to become active", collectionName,
-        CloudTestUtils.clusterShape(1, 1));
+        CloudTestUtils.clusterShape(1, 1, false, true));
 
     getCollectionState(collectionName).forEachReplica((s, replica) -> assertEquals(nodeId, replica.getNodeName()));
 
@@ -152,7 +152,7 @@ public class TestPolicyCloud extends SimSolrCloudTestCase {
         .setPolicy("c1")
         .process(solrClient);
     CloudTestUtils.waitForState(cluster, "Timeout waiting for collection to become active", collectionName,
-        CloudTestUtils.clusterShape(1, 2));
+        CloudTestUtils.clusterShape(1, 2, false, true));
 
     DocCollection docCollection = getCollectionState(collectionName);
     List<Replica> list = docCollection.getReplicas(firstNode);
@@ -257,7 +257,7 @@ public class TestPolicyCloud extends SimSolrCloudTestCase {
     CollectionAdminRequest.createCollectionWithImplicitRouter("policiesTest", "conf", "s1", 1, 1, 1)
         .process(solrClient);
     CloudTestUtils.waitForState(cluster, "Timeout waiting for collection to become active", "policiesTest",
-        CloudTestUtils.clusterShape(1, 3));
+        CloudTestUtils.clusterShape(1, 3, false, true));
 
     DocCollection coll = getCollectionState("policiesTest");
 
@@ -322,7 +322,7 @@ public class TestPolicyCloud extends SimSolrCloudTestCase {
     CollectionAdminRequest.createCollectionWithImplicitRouter("policiesTest", "conf", "shard1", 2)
         .process(solrClient);
     CloudTestUtils.waitForState(cluster, "Timeout waiting for collection to become active", "policiesTest",
-        CloudTestUtils.clusterShape(1, 2));
+        CloudTestUtils.clusterShape(1, 2, false, true));
     DocCollection rulesCollection = getCollectionState("policiesTest");
 
     Map<String, Object> val = cluster.getNodeStateProvider().getNodeValues(rulesCollection.getReplicas().get(0).getNodeName(), Arrays.asList(

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d8fb6916/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestTriggerIntegration.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestTriggerIntegration.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestTriggerIntegration.java
index 0a28a44..91bc3c2 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestTriggerIntegration.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestTriggerIntegration.java
@@ -1160,7 +1160,7 @@ public class TestTriggerIntegration extends SimSolrCloudTestCase {
     CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(COLL1,
         "conf", 1, 2);
     create.process(solrClient);
-    CloudTestUtils.waitForState(cluster, COLL1, 10, TimeUnit.SECONDS, CloudTestUtils.clusterShape(1, 2));
+    CloudTestUtils.waitForState(cluster, COLL1, 10, TimeUnit.SECONDS, CloudTestUtils.clusterShape(1, 2, false, true));
 
     String setTriggerCommand = "{" +
         "'set-trigger' : {" +
@@ -1169,6 +1169,7 @@ public class TestTriggerIntegration extends SimSolrCloudTestCase {
         "'waitFor' : '" + waitForSeconds + "s'," +
         "'enabled' : true," +
         "'aboveRate' : 1.0," +
+        "'aboveNodeRate' : 1.0," +
         "'actions' : [" +
         "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}" +
         "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" +

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d8fb6916/solr/solr-ref-guide/src/solrcloud-autoscaling-triggers.adoc
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/src/solrcloud-autoscaling-triggers.adoc b/solr/solr-ref-guide/src/solrcloud-autoscaling-triggers.adoc
index 0e77c11..e7aaf64 100644
--- a/solr/solr-ref-guide/src/solrcloud-autoscaling-triggers.adoc
+++ b/solr/solr-ref-guide/src/solrcloud-autoscaling-triggers.adoc
@@ -194,14 +194,31 @@ Similarly, if the search rate falls below a threshold then the trigger may reque
 replicas are deleted from "cold" shards. It can also optionally issue node-level action requests
 when a cumulative node-level rate falls below a threshold.
 
+Per-shard rates are calculated as the arithmetic average of the rates of all searchable replicas in a given shard.
+This method was chosen to avoid generating false events when a simple client keeps sending all of its requests
+to a single specific replica (adding or removing other replicas cannot resolve such a situation;
+only proper load balancing can - either by using `CloudSolrClient` or another load-balancing client).
+
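+For example (illustrative numbers): a shard with three searchable replicas reporting 1-minute rates of
+10, 20 and 30 requests per second has a shard-level rate of (10 + 20 + 30) / 3 = 20 requests per second,
+and it is this averaged value that is compared against the configured rate bounds.
+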
 Note: this trigger calculates node-level cumulative rates using per-replica rates reported by
-replicas that are part of monitored collections / shards. This means that it may report
+replicas that are part of monitored collections / shards on each node. This means that it may report
 some nodes as "cold" (underutilized) because it ignores other, perhaps more active, replicas
 belonging to other collections. Also, nodes that don't host any of the monitored replicas or
 those that are explicitly excluded by `node` config property won't be reported at all.
 
-
-This trigger supports the following configuration:
+Note 2: special care should be taken when configuring the `waitFor` property. By default the trigger
+monitors the 1-minute average search rate of each replica. If `waitFor` is set to a comparable value
+(around 1 minute or shorter), changes to the number of replicas - which should in turn change the
+per-replica search rates - may be requested and executed relatively quickly. However, the metric value,
+being a moving average, always lags behind the new "momentary" rate after such changes. This means that
+the monitored metric may not change quickly enough to prevent the trigger from firing again
+(it will continue to report the average rate as violating the threshold for some time after the
+change was executed). As a result the trigger may keep requesting that even more replicas be added
+(or removed), and thus it may "overshoot" the optimal number of replicas.
+For this reason it's recommended to always set `waitFor` to a value several times longer than the
+time constant of the metric used. For example, with the default 1-minute average, `waitFor` should
+be set to at least `2m`.
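+To illustrate the lag (hypothetical numbers): if adding a replica halves a replica's momentary rate from
+100 to 50 requests per second, the 1-minute moving average will keep reporting a value noticeably above 50
+for some time afterwards, so a trigger with a short `waitFor` may fire again based on that stale average.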
+
+This trigger supports the following configuration properties:
 
 `collections`:: (string, optional) comma-separated list of collection names to monitor, or any collection if empty / not set.
 
@@ -211,7 +228,9 @@ This trigger supports the following configuration:
 
 `metric`:: (string, optional) metric name that represents the search rate
 (default is `QUERY./select.requestTimes:1minRate`). This name has to identify a single numeric
-metric value, and it may use the colon syntax for selecting one property of a complex metric.
+metric value, and it may use the colon syntax for selecting one property of a complex metric. This value
+is collected from all replicas of a shard, and the arithmetic average is then calculated per shard
+to determine shard-level violations.
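+For example, the default value uses the colon syntax to select the `1minRate` property of the
+`QUERY./select.requestTimes` metric.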
 
 `maxOps`:: (integer, optional) maximum number of add replica / delete replica operations
 requested in a single autoscaling event. The default value is 3 and it helps to smooth out
@@ -229,12 +248,18 @@ the value is set to 1. Note also that shard leaders are never deleted.
 `belowRate`:: (float) the lower bound for the request rate metric value. At least one of
 `aboveRate` or `belowRate` must be set.
 
+`aboveNodeRate`:: (float) the upper bound for the total request rate metric value per node. If not
+set then cumulative per-node rates will be ignored.
+
+`belowNodeRate`:: (float) the lower bound for the total request rate metric value per node. If not
+set then cumulative per-node rates will be ignored.
+
 `aboveOp`:: (string, optional) collection action to request when the upper threshold for a shard or replica is
 exceeded. Default action is `ADDREPLICA` and the trigger will request from 1 up to `maxOps` operations
 per shard per event, proportionally to how much the rate is exceeded. This property can be set to 'NONE'
 to effectively disable the action but still report it to the listeners.
 
-`aboveNodeOp`:: (string, optional) collection action to request when the upper threshold for a node is exceeded.
+`aboveNodeOp`:: (string, optional) collection action to request when the upper threshold for a node (`aboveNodeRate`) is exceeded.
 Default action is `MOVEREPLICA`, and the trigger will request 1 replica operation per hot node per event.
 If both `aboveOp` and `aboveNodeOp` operations are requested then `aboveNodeOp` operations are
 always requested first. This property can be set to 'NONE' to effectively disable the action but still
@@ -245,7 +270,7 @@ exceeded. Default action is `DELETEREPLICA`, and the trigger will request at mos
 to be deleted from eligible cold shards. This property can be set to 'NONE'
 to effectively disable the action but still report it to the listeners.
 
-`belowNodeOp`:: action to request when the lower threshold for a node is exceeded.
+`belowNodeOp`:: action to request when the per-node rate falls below the lower threshold (`belowNodeRate`).
 Default action is null (not set) and the condition is ignored, because in many cases the
 trigger will monitor only some selected resources (replicas from selected
 collections / shards) so setting this by default to e.g., `DELETENODE` could interfere with
@@ -269,6 +294,7 @@ request node deletion.
   "metric" : "QUERY./select.requestTimes:5minRate",
   "aboveRate" : 100.0,
   "belowRate" : 0.01,
+  "belowNodeRate" : 0.01,
   "belowNodeOp" : "DELETENODE",
   "minReplicas" : 1,
   "waitFor" : "20m",