You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/08/21 01:30:38 UTC
[11/50] [abbrv] lucene-solr:jira/http2: SOLR-12470: Search Rate
Trigger multiple bug fixes, improvements and documentation updates.
SOLR-12470: Search Rate Trigger multiple bug fixes, improvements and documentation updates.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/8dd704ef
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/8dd704ef
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/8dd704ef
Branch: refs/heads/jira/http2
Commit: 8dd704ef78e5a2c3bf7c3d206d44a2d971c758fe
Parents: 7ecf9b6
Author: Andrzej Bialecki <ab...@apache.org>
Authored: Tue Aug 14 20:41:16 2018 +0200
Committer: Andrzej Bialecki <ab...@apache.org>
Committed: Tue Aug 14 20:41:42 2018 +0200
----------------------------------------------------------------------
solr/CHANGES.txt | 2 +
.../org/apache/solr/cloud/ActionThrottle.java | 2 +-
.../cloud/autoscaling/IndexSizeTrigger.java | 3 +
.../cloud/autoscaling/ScheduledTriggers.java | 7 +-
.../cloud/autoscaling/SearchRateTrigger.java | 173 ++++++++++++++-----
.../autoscaling/TriggerActionException.java | 33 ++++
.../apache/solr/cloud/ActionThrottleTest.java | 2 +-
.../org/apache/solr/cloud/CloudTestUtils.java | 29 +++-
.../cloud/autoscaling/IndexSizeTriggerTest.java | 25 +--
.../ScheduledMaintenanceTriggerTest.java | 2 +-
.../SearchRateTriggerIntegrationTest.java | 41 +++--
.../autoscaling/SearchRateTriggerTest.java | 11 +-
.../sim/SimClusterStateProvider.java | 158 +++++++++++------
.../autoscaling/sim/SimNodeStateProvider.java | 14 +-
.../autoscaling/sim/TestComputePlanAction.java | 4 +-
.../autoscaling/sim/TestExecutePlanAction.java | 10 +-
.../cloud/autoscaling/sim/TestLargeCluster.java | 89 +++++++---
.../cloud/autoscaling/sim/TestPolicyCloud.java | 10 +-
.../autoscaling/sim/TestTriggerIntegration.java | 51 +++++-
.../src/solrcloud-autoscaling-triggers.adoc | 49 ++++--
20 files changed, 519 insertions(+), 196 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 08499e6..d165e21 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -233,6 +233,8 @@ Bug Fixes
* SOLR-12541: Metrics handler throws an error if there are transient cores. (ab)
+* SOLR-12470: Search Rate Trigger multiple bug fixes, improvements and documentation updates. (ab)
+
Optimizations
----------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/java/org/apache/solr/cloud/ActionThrottle.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/ActionThrottle.java b/solr/core/src/java/org/apache/solr/cloud/ActionThrottle.java
index f260c8c..1724b53 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ActionThrottle.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ActionThrottle.java
@@ -82,7 +82,7 @@ public class ActionThrottle {
if (sleep > 0) {
log.info("Throttling {} attempts - waiting for {}ms", name, sleep);
try {
- Thread.sleep(sleep);
+ timeSource.sleep(sleep);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/java/org/apache/solr/cloud/autoscaling/IndexSizeTrigger.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/IndexSizeTrigger.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/IndexSizeTrigger.java
index f729864..3f2ea8a 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/IndexSizeTrigger.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/IndexSizeTrigger.java
@@ -407,6 +407,9 @@ public class IndexSizeTrigger extends TriggerBase {
replicas.forEach(r -> lastAboveEventMap.put(r.getCore(), now));
});
belowSize.forEach((coll, replicas) -> {
+ if (replicas.size() < 2) {
+ return;
+ }
lastBelowEventMap.put(replicas.get(0).getCore(), now);
lastBelowEventMap.put(replicas.get(1).getCore(), now);
});
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java
index c125209..7c3cbb0 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java
@@ -308,7 +308,7 @@ public class ScheduledTriggers implements Closeable {
action.process(event, actionContext);
} catch (Exception e) {
triggerListeners1.fireListeners(event.getSource(), event, TriggerEventProcessorStage.FAILED, action.getName(), actionContext, e, null);
- throw new Exception("Error executing action: " + action.getName() + " for trigger event: " + event, e);
+ throw new TriggerActionException(event.getSource(), action.getName(), "Error processing action for trigger event: " + event, e);
}
List<String> afterActions = (List<String>) actionContext.getProperties().computeIfAbsent(TriggerEventProcessorStage.AFTER_ACTION.toString(), k -> new ArrayList<String>());
afterActions.add(action.getName());
@@ -319,8 +319,11 @@ public class ScheduledTriggers implements Closeable {
assert ev.getId().equals(event.getId());
}
triggerListeners1.fireListeners(event.getSource(), event, TriggerEventProcessorStage.SUCCEEDED);
- } catch (Exception e) {
+ } catch (TriggerActionException e) {
log.warn("Exception executing actions", e);
+ } catch (Exception e) {
+ triggerListeners1.fireListeners(event.getSource(), event, TriggerEventProcessorStage.FAILED);
+ log.warn("Unhandled exception executing actions", e);
} finally {
cooldownStart.set(cloudManager.getTimeSource().getTimeNs());
hasPendingActions.set(false);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java
index 1824f7f..81d56d3 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java
@@ -63,6 +63,8 @@ public class SearchRateTrigger extends TriggerBase {
public static final String MIN_REPLICAS_PROP = "minReplicas";
public static final String ABOVE_RATE_PROP = "aboveRate";
public static final String BELOW_RATE_PROP = "belowRate";
+ public static final String ABOVE_NODE_RATE_PROP = "aboveNodeRate";
+ public static final String BELOW_NODE_RATE_PROP = "belowNodeRate";
public static final String ABOVE_OP_PROP = "aboveOp";
public static final String BELOW_OP_PROP = "belowOp";
public static final String ABOVE_NODE_OP_PROP = "aboveNodeOp";
@@ -81,6 +83,7 @@ public class SearchRateTrigger extends TriggerBase {
public static final String COLD_COLLECTIONS = "coldCollections";
public static final String COLD_SHARDS = "coldShards";
public static final String COLD_REPLICAS = "coldReplicas";
+ public static final String VIOLATION_PROP = "violationType";
public static final int DEFAULT_MAX_OPS = 3;
public static final String DEFAULT_METRIC = "QUERY./select.requestTimes:1minRate";
@@ -93,6 +96,8 @@ public class SearchRateTrigger extends TriggerBase {
private String node;
private double aboveRate;
private double belowRate;
+ private double aboveNodeRate;
+ private double belowNodeRate;
private CollectionParams.CollectionAction aboveOp, belowOp, aboveNodeOp, belowNodeOp;
private final Map<String, Long> lastCollectionEvent = new ConcurrentHashMap<>();
private final Map<String, Long> lastNodeEvent = new ConcurrentHashMap<>();
@@ -117,6 +122,8 @@ public class SearchRateTrigger extends TriggerBase {
BELOW_NODE_OP_PROP,
ABOVE_RATE_PROP,
BELOW_RATE_PROP,
+ ABOVE_NODE_RATE_PROP,
+ BELOW_NODE_RATE_PROP,
// back-compat props
BC_COLLECTION_PROP,
BC_RATE_PROP);
@@ -192,6 +199,28 @@ public class SearchRateTrigger extends TriggerBase {
belowRate = -1;
}
+ // node rates
+ above = properties.get(ABOVE_NODE_RATE_PROP);
+ below = properties.get(BELOW_NODE_RATE_PROP);
+ if (above != null) {
+ try {
+ aboveNodeRate = Double.parseDouble(String.valueOf(above));
+ } catch (Exception e) {
+ throw new TriggerValidationException(name, ABOVE_NODE_RATE_PROP, "Invalid configuration value: '" + above + "': " + e.toString());
+ }
+ } else {
+ aboveNodeRate = Double.MAX_VALUE;
+ }
+ if (below != null) {
+ try {
+ belowNodeRate = Double.parseDouble(String.valueOf(below));
+ } catch (Exception e) {
+ throw new TriggerValidationException(name, BELOW_NODE_RATE_PROP, "Invalid configuration value: '" + below + "': " + e.toString());
+ }
+ } else {
+ belowNodeRate = -1;
+ }
+
String aboveOpStr = String.valueOf(properties.getOrDefault(ABOVE_OP_PROP, CollectionParams.CollectionAction.ADDREPLICA.toLower()));
String belowOpStr = String.valueOf(properties.getOrDefault(BELOW_OP_PROP, CollectionParams.CollectionAction.DELETEREPLICA.toLower()));
aboveOp = CollectionParams.CollectionAction.get(aboveOpStr);
@@ -231,6 +260,8 @@ public class SearchRateTrigger extends TriggerBase {
config.put(MIN_REPLICAS_PROP, minReplicas);
config.put(ABOVE_RATE_PROP, aboveRate);
config.put(BELOW_RATE_PROP, belowRate);
+ config.put(ABOVE_NODE_RATE_PROP, aboveNodeRate);
+ config.put(BELOW_NODE_RATE_PROP, belowNodeRate);
config.put(ABOVE_OP_PROP, aboveOp);
config.put(ABOVE_NODE_OP_PROP, aboveNodeOp);
config.put(BELOW_OP_PROP, belowOp);
@@ -339,6 +370,10 @@ public class SearchRateTrigger extends TriggerBase {
continue;
}
Map<String, Object> rates = cloudManager.getNodeStateProvider().getNodeValues(node, metricTags.keySet());
+ if (log.isDebugEnabled()) {
+ log.debug("### rates for node " + node);
+ rates.forEach((tag, rate) -> log.debug("### " + tag + "\t" + rate));
+ }
rates.forEach((tag, rate) -> {
ReplicaInfo info = metricTags.get(tag);
if (info == null) {
@@ -355,18 +390,28 @@ public class SearchRateTrigger extends TriggerBase {
});
}
+ if (log.isDebugEnabled()) {
+ collectionRates.forEach((coll, collRates) -> {
+ log.debug("## Collection: {}", coll);
+ collRates.forEach((s, replicas) -> {
+ log.debug("## - {}", s);
+ replicas.forEach(ri -> log.debug("## {} {}", ri.getCore(), ri.getVariable(AutoScalingParams.RATE)));
+ });
+ });
+ }
long now = cloudManager.getTimeSource().getTimeNs();
Map<String, Double> hotNodes = new HashMap<>();
Map<String, Double> coldNodes = new HashMap<>();
+
// check for exceeded rates and filter out those with less than waitFor from previous events
nodeRates.entrySet().stream()
.filter(entry -> node.equals(Policy.ANY) || node.equals(entry.getKey()))
.forEach(entry -> {
- if (entry.getValue().get() > aboveRate) {
+ if (entry.getValue().get() > aboveNodeRate) {
if (waitForElapsed(entry.getKey(), now, lastNodeEvent)) {
hotNodes.put(entry.getKey(), entry.getValue().get());
}
- } else if (entry.getValue().get() < belowRate) {
+ } else if (entry.getValue().get() < belowNodeRate) {
if (waitForElapsed(entry.getKey(), now, lastNodeEvent)) {
coldNodes.put(entry.getKey(), entry.getValue().get());
}
@@ -383,7 +428,7 @@ public class SearchRateTrigger extends TriggerBase {
List<ReplicaInfo> coldReplicas = new ArrayList<>();
collectionRates.forEach((coll, shardRates) -> {
shardRates.forEach((sh, replicaRates) -> {
- double shardRate = replicaRates.stream()
+ double totalShardRate = replicaRates.stream()
.map(r -> {
String elapsedKey = r.getCollection() + "." + r.getCore();
if ((Double)r.getVariable(AutoScalingParams.RATE) > aboveRate) {
@@ -401,7 +446,10 @@ public class SearchRateTrigger extends TriggerBase {
return r;
})
.mapToDouble(r -> (Double)r.getVariable(AutoScalingParams.RATE)).sum();
+ // calculate average shard rate over all searchable replicas (see SOLR-12470)
+ double shardRate = totalShardRate / searchableReplicationFactors.get(coll).get(sh).doubleValue();
String elapsedKey = coll + "." + sh;
+ log.debug("-- {}: totalShardRate={}, shardRate={}", elapsedKey, totalShardRate, shardRate);
if ((collections.isEmpty() || collections.contains(coll)) &&
(shard.equals(Policy.ANY) || shard.equals(sh))) {
if (shardRate > aboveRate) {
@@ -411,6 +459,13 @@ public class SearchRateTrigger extends TriggerBase {
} else if (shardRate < belowRate) {
if (waitForElapsed(elapsedKey, now, lastShardEvent)) {
coldShards.computeIfAbsent(coll, s -> new HashMap<>()).put(sh, shardRate);
+ log.debug("-- coldShard waitFor elapsed {}", elapsedKey);
+ } else {
+ if (log.isDebugEnabled()) {
+ Long lastTime = lastShardEvent.computeIfAbsent(elapsedKey, s -> now);
+ long elapsed = TimeUnit.SECONDS.convert(now - lastTime, TimeUnit.NANOSECONDS);
+ log.debug("-- waitFor didn't elapse for {}, waitFor={}, elapsed={}", elapsedKey, getWaitForSecond(), elapsed);
+ }
}
} else {
// no violation - clear waitForElapsed
@@ -511,9 +566,10 @@ public class SearchRateTrigger extends TriggerBase {
});
final List<TriggerEvent.Op> ops = new ArrayList<>();
+ final Set<String> violations = new HashSet<>();
- calculateHotOps(ops, searchableReplicationFactors, hotNodes, hotCollections, hotShards, hotReplicas);
- calculateColdOps(ops, clusterState, searchableReplicationFactors, coldNodes, coldCollections, coldShards, coldReplicas);
+ calculateHotOps(ops, violations, searchableReplicationFactors, hotNodes, hotCollections, hotShards, hotReplicas);
+ calculateColdOps(ops, violations, clusterState, searchableReplicationFactors, coldNodes, coldCollections, coldShards, coldReplicas);
if (ops.isEmpty()) {
return;
@@ -521,7 +577,7 @@ public class SearchRateTrigger extends TriggerBase {
if (processor.process(new SearchRateEvent(getName(), eventTime.get(), ops,
hotNodes, hotCollections, hotShards, hotReplicas,
- coldNodes, coldCollections, coldShards, coldReplicas))) {
+ coldNodes, coldCollections, coldShards, coldReplicas, violations))) {
// update lastEvent times
hotNodes.keySet().forEach(node -> lastNodeEvent.put(node, now));
coldNodes.keySet().forEach(node -> lastNodeEvent.put(node, now));
@@ -537,6 +593,7 @@ public class SearchRateTrigger extends TriggerBase {
}
private void calculateHotOps(List<TriggerEvent.Op> ops,
+ Set<String> violations,
Map<String, Map<String, AtomicInteger>> searchableReplicationFactors,
Map<String, Double> hotNodes,
Map<String, Double> hotCollections,
@@ -545,40 +602,47 @@ public class SearchRateTrigger extends TriggerBase {
// calculate the number of replicas to add to each hot shard, based on how much the rate was
// exceeded - but within limits.
- // first resolve a situation when only a node is hot but no collection / shard / replica is hot
+ // first resolve a situation when only a node is hot but no collection / shard is hot
// TODO: eventually we may want to commission a new node
- if (!hotNodes.isEmpty() && hotShards.isEmpty() && hotCollections.isEmpty() && hotReplicas.isEmpty()) {
- // move replicas around
- if (aboveNodeOp != null) {
- hotNodes.forEach((n, r) -> {
- ops.add(new TriggerEvent.Op(aboveNodeOp, Suggester.Hint.SRC_NODE, n));
- });
- }
- } else {
- // add replicas
- Map<String, Map<String, List<Pair<String, String>>>> hints = new HashMap<>();
-
- hotShards.forEach((coll, shards) -> shards.forEach((s, r) -> {
- List<Pair<String, String>> perShard = hints
- .computeIfAbsent(coll, c -> new HashMap<>())
- .computeIfAbsent(s, sh -> new ArrayList<>());
- addReplicaHints(coll, s, r, searchableReplicationFactors.get(coll).get(s).get(), perShard);
- }));
- hotReplicas.forEach(ri -> {
- double r = (Double)ri.getVariable(AutoScalingParams.RATE);
- // add only if not already accounted for in hotShards
- List<Pair<String, String>> perShard = hints
- .computeIfAbsent(ri.getCollection(), c -> new HashMap<>())
- .computeIfAbsent(ri.getShard(), sh -> new ArrayList<>());
- if (perShard.isEmpty()) {
- addReplicaHints(ri.getCollection(), ri.getShard(), r, searchableReplicationFactors.get(ri.getCollection()).get(ri.getShard()).get(), perShard);
+ if (!hotNodes.isEmpty()) {
+ if (hotShards.isEmpty() && hotCollections.isEmpty()) {
+ // move replicas around
+ if (aboveNodeOp != null) {
+ hotNodes.forEach((n, r) -> {
+ ops.add(new TriggerEvent.Op(aboveNodeOp, Suggester.Hint.SRC_NODE, n));
+ violations.add(HOT_NODES);
+ });
}
- });
-
- hints.values().forEach(m -> m.values().forEach(lst -> lst.forEach(p -> {
- ops.add(new TriggerEvent.Op(aboveOp, Suggester.Hint.COLL_SHARD, p));
- })));
+ } else {
+ // ignore - hot shards will result in changes that will change hot node status anyway
+ }
}
+ // add replicas
+ Map<String, Map<String, List<Pair<String, String>>>> hints = new HashMap<>();
+
+ // HOT COLLECTIONS
+ // currently we don't do anything for hot collections. Theoretically we could add
+ // 1 replica more to each shard, based on how close to the threshold each shard is
+ // but it's probably better to wait for a shard to become hot and be more precise.
+
+ // HOT SHARDS
+
+ hotShards.forEach((coll, shards) -> shards.forEach((s, r) -> {
+ List<Pair<String, String>> perShard = hints
+ .computeIfAbsent(coll, c -> new HashMap<>())
+ .computeIfAbsent(s, sh -> new ArrayList<>());
+ addReplicaHints(coll, s, r, searchableReplicationFactors.get(coll).get(s).get(), perShard);
+ violations.add(HOT_SHARDS);
+ }));
+
+ // HOT REPLICAS
+ // Hot replicas (while their shards are not hot) may be caused by
+ // dumb clients that use direct replica URLs - this is beyond our control
+ // so ignore them.
+
+ hints.values().forEach(m -> m.values().forEach(lst -> lst.forEach(p -> {
+ ops.add(new TriggerEvent.Op(aboveOp, Suggester.Hint.COLL_SHARD, p));
+ })));
}
@@ -601,6 +665,7 @@ public class SearchRateTrigger extends TriggerBase {
}
private void calculateColdOps(List<TriggerEvent.Op> ops,
+ Set<String> violations,
ClusterState clusterState,
Map<String, Map<String, AtomicInteger>> searchableReplicationFactors,
Map<String, Double> coldNodes,
@@ -611,12 +676,13 @@ public class SearchRateTrigger extends TriggerBase {
// Probably can't do anything reasonable about whole cold collections
// because they may be needed even if not used.
- // COLD SHARDS:
- // Cold shards mean that there are too many replicas per shard - but it also
- // means that all replicas in these shards are cold too, so we can simply
- // address this by deleting cold replicas
+ // COLD SHARDS & COLD REPLICAS:
+ // We remove cold replicas only from cold shards, otherwise we are susceptible to uneven
+ // replica routing (which is beyond our control).
+ // If we removed replicas from non-cold shards we could accidentally bring that shard into
+ // the hot range, which would result in adding replica, and that replica could again stay cold due to
+ // the same routing issue, which then would lead to removing that replica, etc, etc...
- // COLD REPLICAS:
// Remove cold replicas but only when there's at least a minimum number of searchable
// replicas still available (additional non-searchable replicas may exist, too)
// NOTE: do this before adding ops for DELETENODE because we don't want to attempt
@@ -627,11 +693,18 @@ public class SearchRateTrigger extends TriggerBase {
.computeIfAbsent(ri.getShard(), s -> new ArrayList<>())
.add(ri);
});
- byCollectionByShard.forEach((coll, shards) -> {
- shards.forEach((shard, replicas) -> {
+ coldShards.forEach((coll, perShard) -> {
+ perShard.forEach((shard, rate) -> {
+ List<ReplicaInfo> replicas = byCollectionByShard
+ .getOrDefault(coll, Collections.emptyMap())
+ .getOrDefault(shard, Collections.emptyList());
+ if (replicas.isEmpty()) {
+ return;
+ }
// only delete if there's at least minRF searchable replicas left
int rf = searchableReplicationFactors.get(coll).get(shard).get();
- // we only really need a leader and we may be allowed to remove other replicas
+ // assume first that we only really need a leader and we may be
+ // allowed to remove other replicas
int minRF = 1;
// but check the official RF and don't go below that
Integer RF = clusterState.getCollection(coll).getReplicationFactor();
@@ -660,6 +733,7 @@ public class SearchRateTrigger extends TriggerBase {
Suggester.Hint.COLL_SHARD, new Pair<>(ri.getCollection(), ri.getShard()));
op.addHint(Suggester.Hint.REPLICA, ri.getName());
ops.add(op);
+ violations.add(COLD_SHARDS);
limit.decrementAndGet();
});
}
@@ -669,7 +743,7 @@ public class SearchRateTrigger extends TriggerBase {
// COLD NODES:
// Unlike the case of hot nodes, if a node is cold then any monitored
// collections / shards / replicas located on that node are cold, too.
- // HOWEVER, we check only non-pull replicas and only from selected collections / shards,
+ // HOWEVER, we check only replicas from selected collections / shards,
// so deleting a cold node is dangerous because it may interfere with these
// non-monitored resources - this is the reason the default belowNodeOp is null / ignored.
//
@@ -679,6 +753,7 @@ public class SearchRateTrigger extends TriggerBase {
if (belowNodeOp != null) {
coldNodes.forEach((node, rate) -> {
ops.add(new TriggerEvent.Op(belowNodeOp, Suggester.Hint.SRC_NODE, node));
+ violations.add(COLD_NODES);
});
}
@@ -688,7 +763,7 @@ public class SearchRateTrigger extends TriggerBase {
private boolean waitForElapsed(String name, long now, Map<String, Long> lastEventMap) {
Long lastTime = lastEventMap.computeIfAbsent(name, s -> now);
long elapsed = TimeUnit.SECONDS.convert(now - lastTime, TimeUnit.NANOSECONDS);
- log.trace("name={}, lastTime={}, elapsed={}", name, lastTime, elapsed);
+ log.trace("name={}, lastTime={}, elapsed={}, waitFor={}", name, lastTime, elapsed, getWaitForSecond());
if (TimeUnit.SECONDS.convert(now - lastTime, TimeUnit.NANOSECONDS) < getWaitForSecond()) {
return false;
}
@@ -704,7 +779,8 @@ public class SearchRateTrigger extends TriggerBase {
Map<String, Double> coldNodes,
Map<String, Double> coldCollections,
Map<String, Map<String, Double>> coldShards,
- List<ReplicaInfo> coldReplicas) {
+ List<ReplicaInfo> coldReplicas,
+ Set<String> violations) {
super(TriggerEventType.SEARCHRATE, source, eventTime, null);
properties.put(TriggerEvent.REQUESTED_OPS, ops);
properties.put(HOT_NODES, hotNodes);
@@ -715,6 +791,7 @@ public class SearchRateTrigger extends TriggerBase {
properties.put(COLD_COLLECTIONS, coldCollections);
properties.put(COLD_SHARDS, coldShards);
properties.put(COLD_REPLICAS, coldReplicas);
+ properties.put(VIOLATION_PROP, violations);
}
}
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerActionException.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerActionException.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerActionException.java
new file mode 100644
index 0000000..624ce68
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerActionException.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.cloud.autoscaling;
+
+/**
+ * Trigger action-specific exception.
+ */
+public class TriggerActionException extends Exception {
+
+ public final String triggerName;
+ public final String actionName;
+
+ public TriggerActionException(String triggerName, String actionName, String message, Throwable cause) {
+ super(message, cause);
+ this.triggerName = triggerName;
+ this.actionName = actionName;
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/ActionThrottleTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/ActionThrottleTest.java b/solr/core/src/test/org/apache/solr/cloud/ActionThrottleTest.java
index d277e31..d8fe78b 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ActionThrottleTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ActionThrottleTest.java
@@ -53,7 +53,7 @@ public class ActionThrottleTest extends SolrTestCaseJ4 {
@Override
public void sleep(long ms) throws InterruptedException {
- throw new UnsupportedOperationException();
+ TimeSource.NANO_TIME.sleep(ms);
}
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/CloudTestUtils.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/CloudTestUtils.java b/solr/core/src/test/org/apache/solr/cloud/CloudTestUtils.java
index 448889f..b67b551 100644
--- a/solr/core/src/test/org/apache/solr/cloud/CloudTestUtils.java
+++ b/solr/core/src/test/org/apache/solr/cloud/CloudTestUtils.java
@@ -20,6 +20,7 @@ package org.apache.solr.cloud;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Collection;
+import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
@@ -118,7 +119,7 @@ public class CloudTestUtils {
* @param expectedReplicas expected number of active replicas
*/
public static CollectionStatePredicate clusterShape(int expectedShards, int expectedReplicas) {
- return clusterShape(expectedShards, expectedReplicas, false);
+ return clusterShape(expectedShards, expectedReplicas, false, false);
}
/**
@@ -129,30 +130,46 @@ public class CloudTestUtils {
* @param expectedShards expected number of shards
* @param expectedReplicas expected number of active replicas
* @param withInactive if true then count also inactive shards
+ * @param requireLeaders if true then require that each shard has a leader
*/
- public static CollectionStatePredicate clusterShape(int expectedShards, int expectedReplicas, boolean withInactive) {
+ public static CollectionStatePredicate clusterShape(int expectedShards, int expectedReplicas, boolean withInactive,
+ boolean requireLeaders) {
return (liveNodes, collectionState) -> {
if (collectionState == null) {
- log.debug("-- null collection");
+ log.trace("-- null collection");
return false;
}
Collection<Slice> slices = withInactive ? collectionState.getSlices() : collectionState.getActiveSlices();
if (slices.size() != expectedShards) {
- log.debug("-- wrong number of active slices, expected=" + expectedShards + ", found=" + collectionState.getSlices().size());
+ log.trace("-- wrong number of active slices, expected={}, found={}", expectedShards, collectionState.getSlices().size());
return false;
}
+ Set<String> leaderless = new HashSet<>();
for (Slice slice : slices) {
int activeReplicas = 0;
+ if (requireLeaders && slice.getLeader() == null) {
+ leaderless.add(slice.getName());
+ continue;
+ }
+ // skip other checks, we're going to fail anyway
+ if (!leaderless.isEmpty()) {
+ continue;
+ }
for (Replica replica : slice) {
if (replica.isActive(liveNodes))
activeReplicas++;
}
if (activeReplicas != expectedReplicas) {
- log.debug("-- wrong number of active replicas in slice " + slice.getName() + ", expected=" + expectedReplicas + ", found=" + activeReplicas);
+ log.trace("-- wrong number of active replicas in slice {}, expected={}, found={}", slice.getName(), expectedReplicas, activeReplicas);
return false;
}
}
- return true;
+ if (leaderless.isEmpty()) {
+ return true;
+ } else {
+ log.trace("-- shards without leaders: {}", leaderless);
+ return false;
+ }
};
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/IndexSizeTriggerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/IndexSizeTriggerTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/IndexSizeTriggerTest.java
index daf9867..e591c57 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/IndexSizeTriggerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/IndexSizeTriggerTest.java
@@ -67,7 +67,6 @@ import static org.apache.solr.common.cloud.ZkStateReader.SOLR_AUTOSCALING_CONF_P
*
*/
@LogLevel("org.apache.solr.cloud.autoscaling=DEBUG")
-//05-Jul-2018 @LuceneTestCase.BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12392")
public class IndexSizeTriggerTest extends SolrCloudTestCase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@@ -116,6 +115,10 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
if (cloudManager instanceof SimCloudManager) {
log.info(((SimCloudManager) cloudManager).dumpClusterState(true));
((SimCloudManager) cloudManager).getSimClusterStateProvider().simDeleteAllCollections();
+ ((SimCloudManager) cloudManager).simClearSystemCollection();
+ ((SimCloudManager) cloudManager).getSimClusterStateProvider().simResetLeaderThrottles();
+ ((SimCloudManager) cloudManager).simRestartOverseer(null);
+ cloudManager.getTimeSource().sleep(500);
((SimCloudManager) cloudManager).simResetOpCounts();
} else {
cluster.deleteAllCollections();
@@ -137,14 +140,14 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
}
@Test
- @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 05-Jul-2018
+ //@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 05-Jul-2018
public void testTrigger() throws Exception {
String collectionName = "testTrigger_collection";
CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collectionName,
"conf", 2, 2).setMaxShardsPerNode(2);
create.process(solrClient);
CloudTestUtils.waitForState(cloudManager, "failed to create " + collectionName, collectionName,
- CloudTestUtils.clusterShape(2, 2));
+ CloudTestUtils.clusterShape(2, 2, false, true));
long waitForSeconds = 3 + random().nextInt(5);
Map<String, Object> props = createTriggerProps(waitForSeconds);
@@ -235,14 +238,14 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
}
@Test
- @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 05-Jul-2018
+ //@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 05-Jul-2018
public void testSplitIntegration() throws Exception {
String collectionName = "testSplitIntegration_collection";
CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collectionName,
"conf", 2, 2).setMaxShardsPerNode(2);
create.process(solrClient);
CloudTestUtils.waitForState(cloudManager, "failed to create " + collectionName, collectionName,
- CloudTestUtils.clusterShape(2, 2));
+ CloudTestUtils.clusterShape(2, 2, false, true));
long waitForSeconds = 3 + random().nextInt(5);
// add disabled trigger
@@ -310,7 +313,7 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
boolean await = finished.await(60000 / SPEED, TimeUnit.MILLISECONDS);
assertTrue("did not finish processing in time", await);
- CloudTestUtils.waitForState(cloudManager, collectionName, 20, TimeUnit.SECONDS, CloudTestUtils.clusterShape(6, 2, true));
+ CloudTestUtils.waitForState(cloudManager, collectionName, 20, TimeUnit.SECONDS, CloudTestUtils.clusterShape(6, 2, true, true));
assertEquals(1, listenerEvents.size());
List<CapturedEvent> events = listenerEvents.get("capturing");
assertNotNull("'capturing' events not found", events);
@@ -348,14 +351,14 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
}
@Test
- @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 05-Jul-2018
+ //@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 05-Jul-2018
public void testMergeIntegration() throws Exception {
String collectionName = "testMergeIntegration_collection";
CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collectionName,
"conf", 2, 2).setMaxShardsPerNode(2);
create.process(solrClient);
CloudTestUtils.waitForState(cloudManager, "failed to create " + collectionName, collectionName,
- CloudTestUtils.clusterShape(2, 2));
+ CloudTestUtils.clusterShape(2, 2, false, true));
for (int i = 0; i < 10; i++) {
SolrInputDocument doc = new SolrInputDocument("id", "id-" + (i * 100));
@@ -458,7 +461,7 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
}
@Test
- @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 05-Jul-2018
+ //@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 05-Jul-2018
public void testMixedBounds() throws Exception {
// if (cloudManager instanceof SimCloudManager) {
// log.warn("Requires SOLR-12208");
@@ -470,7 +473,7 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
"conf", 2, 2).setMaxShardsPerNode(2);
create.process(solrClient);
CloudTestUtils.waitForState(cloudManager, "failed to create " + collectionName, collectionName,
- CloudTestUtils.clusterShape(2, 2));
+ CloudTestUtils.clusterShape(2, 2, false, true));
for (int j = 0; j < 10; j++) {
UpdateRequest ureq = new UpdateRequest();
@@ -583,7 +586,7 @@ public class IndexSizeTriggerTest extends SolrCloudTestCase {
// collection should have 2 inactive and 4 active shards
CloudTestUtils.waitForState(cloudManager, "failed to create " + collectionName, collectionName,
- CloudTestUtils.clusterShape(6, 2, true));
+ CloudTestUtils.clusterShape(6, 2, true, true));
// check ops
List<TriggerEvent.Op> ops = (List<TriggerEvent.Op>) events.get(4).event.getProperty(TriggerEvent.REQUESTED_OPS);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/ScheduledMaintenanceTriggerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/ScheduledMaintenanceTriggerTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/ScheduledMaintenanceTriggerTest.java
index 164db8f..56896a7 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/ScheduledMaintenanceTriggerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/ScheduledMaintenanceTriggerTest.java
@@ -165,7 +165,7 @@ public class ScheduledMaintenanceTriggerTest extends SolrCloudTestCase {
.setShardName("shard1");
split1.process(solrClient);
CloudTestUtils.waitForState(cloudManager, "failed to split " + collection1, collection1,
- CloudTestUtils.clusterShape(3, 1, true));
+ CloudTestUtils.clusterShape(3, 1, true, true));
String setListenerCommand = "{" +
"'set-listener' : " +
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java
index f8492d9..19ac4b3 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java
@@ -235,9 +235,8 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
assertTrue(TimeUnit.SECONDS.convert(waitForSeconds, TimeUnit.NANOSECONDS) - WAIT_FOR_DELTA_NANOS <= now - ev.event.getEventTime());
Map<String, Double> nodeRates = (Map<String, Double>) ev.event.getProperties().get(SearchRateTrigger.HOT_NODES);
assertNotNull("nodeRates", nodeRates);
- assertTrue(nodeRates.toString(), nodeRates.size() > 0);
- AtomicDouble totalNodeRate = new AtomicDouble();
- nodeRates.forEach((n, r) -> totalNodeRate.addAndGet(r));
+ // no node violations because node rates weren't set in the config
+ assertTrue(nodeRates.toString(), nodeRates.isEmpty());
List<ReplicaInfo> replicaRates = (List<ReplicaInfo>) ev.event.getProperties().get(SearchRateTrigger.HOT_REPLICAS);
assertNotNull("replicaRates", replicaRates);
assertTrue(replicaRates.toString(), replicaRates.size() > 0);
@@ -260,8 +259,8 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
Double collectionRate = collectionRates.get(COLL1);
assertNotNull(collectionRate);
assertTrue(collectionRate > 5.0);
- assertEquals(collectionRate, totalNodeRate.get(), 5.0);
- assertEquals(collectionRate, totalShardRate.get(), 5.0);
+ // two replicas - the trigger calculates average over all searchable replicas
+ assertEquals(collectionRate / 2, totalShardRate.get(), 5.0);
assertEquals(collectionRate, totalReplicaRate.get(), 5.0);
// check operations
@@ -302,7 +301,11 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
"'enabled' : false," +
"'collections' : '" + COLL1 + "'," +
"'aboveRate' : 1.0," +
- "'belowRate' : 0.1," +
+ // RecoveryStrategy calls /admin/ping, which calls /select so this may not be zero
+ // even when no external requests were made
+ "'belowRate' : 0.3," +
+ "'aboveNodeRate' : 1.0," +
+ "'belowNodeRate' : 0.3," +
// do nothing but generate an op
"'belowNodeOp' : 'none'," +
"'actions' : [" +
@@ -383,7 +386,7 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
- timeSource.sleep(5000);
+ timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS));
List<CapturedEvent> events = listenerEvents.get("srt");
assertEquals(events.toString(), 3, events.size());
@@ -442,13 +445,19 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
assertEquals(ev.toString(), "compute", ev.actionName);
ops = (List<TriggerEvent.Op>)ev.event.getProperty(TriggerEvent.REQUESTED_OPS);
assertNotNull("there should be some requestedOps: " + ev.toString(), ops);
- assertEquals(ops.toString(), 1, ops.size());
+ assertEquals(ops.toString(), 2, ops.size());
assertEquals(ops.toString(), CollectionParams.CollectionAction.NONE, ops.get(0).getAction());
+ assertEquals(ops.toString(), CollectionParams.CollectionAction.NONE, ops.get(1).getAction());
+
+ // wait for waitFor to elapse for all types of violations
+ timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds * 2, TimeUnit.SECONDS));
listenerEvents.clear();
finished = new CountDownLatch(1);
started = new CountDownLatch(1);
+ log.info("## test single replicas.");
+
// now allow single replicas
setTriggerCommand = "{" +
"'set-trigger' : {" +
@@ -458,7 +467,9 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
"'enabled' : true," +
"'collections' : '" + COLL1 + "'," +
"'aboveRate' : 1.0," +
- "'belowRate' : 0.1," +
+ "'belowRate' : 0.3," +
+ "'aboveNodeRate' : 1.0," +
+ "'belowNodeRate' : 0.3," +
"'minReplicas' : 1," +
"'belowNodeOp' : 'none'," +
"'actions' : [" +
@@ -491,21 +502,20 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
assertEquals(ev.toString(), "compute", ev.actionName);
ops = (List<TriggerEvent.Op>)ev.event.getProperty(TriggerEvent.REQUESTED_OPS);
assertNotNull("there should be some requestedOps: " + ev.toString(), ops);
- assertEquals(ops.toString(), 2, ops.size());
+
+ assertTrue(ops.toString(), ops.size() > 0);
AtomicInteger coldNodes2 = new AtomicInteger();
- AtomicInteger coldReplicas2 = new AtomicInteger();
ops.forEach(op -> {
if (op.getAction().equals(CollectionParams.CollectionAction.NONE)) {
coldNodes2.incrementAndGet();
} else if (op.getAction().equals(CollectionParams.CollectionAction.DELETEREPLICA)) {
- coldReplicas2.incrementAndGet();
+ // ignore
} else {
fail("unexpected op: " + op);
}
});
- assertEquals("coldNodes", 1, coldNodes2.get());
- assertEquals("colReplicas", 1, coldReplicas2.get());
+ assertEquals("coldNodes: " +ops.toString(), 2, coldNodes2.get());
// now the collection should be at RF == 1, with one additional PULL replica
CloudTestUtils.waitForState(cloudManager, COLL1, 60, TimeUnit.SECONDS,
@@ -543,6 +553,9 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
"'collections' : '" + COLL1 + "'," +
"'aboveRate' : 1.0," +
"'belowRate' : 0.1," +
+ // set limits to node rates
+ "'aboveNodeRate' : 1.0," +
+ "'belowNodeRate' : 0.1," +
// allow deleting all spare replicas
"'minReplicas' : 1," +
// allow requesting all deletions in one event
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerTest.java
index 60b973f..0e9f4fb 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerTest.java
@@ -174,11 +174,11 @@ public class SearchRateTriggerTest extends SolrCloudTestCase {
}
Thread.sleep(waitForSeconds * 1000);
trigger.run();
- // should generate node and collection event but not for COLL2 because of waitFor
+ // should generate collection event but not for COLL2 because of waitFor
assertEquals(1, events.size());
event = events.get(0);
Map<String, Double> hotNodes = (Map<String, Double>)event.getProperty(SearchRateTrigger.HOT_NODES);
- assertEquals(3, hotNodes.size());
+ assertTrue("hotNodes", hotNodes.isEmpty());
hotNodes.forEach((n, r) -> assertTrue(n, r > rate));
hotCollections = (Map<String, Double>)event.getProperty(SearchRateTrigger.HOT_COLLECTIONS);
assertEquals(1, hotCollections.size());
@@ -193,7 +193,7 @@ public class SearchRateTriggerTest extends SolrCloudTestCase {
Thread.sleep(waitForSeconds * 1000 * 2);
trigger.run();
- // should generate node and collection event
+ // should generate collection event
assertEquals(1, events.size());
event = events.get(0);
hotCollections = (Map<String, Double>)event.getProperty(SearchRateTrigger.HOT_COLLECTIONS);
@@ -203,8 +203,7 @@ public class SearchRateTriggerTest extends SolrCloudTestCase {
Rate = hotCollections.get(COLL2);
assertNotNull(Rate);
hotNodes = (Map<String, Double>)event.getProperty(SearchRateTrigger.HOT_NODES);
- assertEquals(3, hotNodes.size());
- hotNodes.forEach((n, r) -> assertTrue(n, r > rate));
+ assertTrue("hotNodes", hotNodes.isEmpty());
}
}
@@ -266,7 +265,7 @@ public class SearchRateTriggerTest extends SolrCloudTestCase {
hotCollections = (Map<String, Object>)event.properties.get(SearchRateTrigger.HOT_COLLECTIONS);
hotShards = (Map<String, Object>)event.properties.get(SearchRateTrigger.HOT_SHARDS);
hotReplicas = (List<ReplicaInfo>)event.properties.get(SearchRateTrigger.HOT_REPLICAS);
- assertFalse("no hot nodes?", hotNodes.isEmpty());
+ assertTrue("no hot nodes?", hotNodes.isEmpty());
assertFalse("no hot collections?", hotCollections.isEmpty());
assertFalse("no hot shards?", hotShards.isEmpty());
assertFalse("no hot replicas?", hotReplicas.isEmpty());
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
index abc3ccf..741a868 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
@@ -53,6 +53,7 @@ import org.apache.solr.client.solrj.impl.ClusterStateProvider;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.cloud.ActionThrottle;
+import org.apache.solr.cloud.CloudTestUtils;
import org.apache.solr.cloud.Overseer;
import org.apache.solr.cloud.api.collections.AddReplicaCmd;
import org.apache.solr.cloud.api.collections.Assign;
@@ -143,6 +144,8 @@ public class SimClusterStateProvider implements ClusterStateProvider {
private AtomicReference<Map<String, DocCollection>> collectionsStatesRef = new AtomicReference<>();
private AtomicBoolean saveClusterState = new AtomicBoolean();
+ private transient boolean closed;
+
/**
* The instance needs to be initialized using the <code>sim*</code> methods in order
* to ensure proper behavior, otherwise it will behave as a cluster with zero replicas.
@@ -243,7 +246,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
return null;
}
for (ReplicaInfo ri : list) {
- if (r.getName().equals(ri.getName())) {
+ if (r.getCoreName().equals(ri.getCore())) {
return ri;
}
}
@@ -255,6 +258,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
* @param nodeId unique node id
*/
public void simAddNode(String nodeId) throws Exception {
+ ensureNotClosed();
if (liveNodes.contains(nodeId)) {
throw new Exception("Node " + nodeId + " already exists");
}
@@ -271,6 +275,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
* @return true if a node existed and was removed
*/
public boolean simRemoveNode(String nodeId) throws Exception {
+ ensureNotClosed();
lock.lockInterruptibly();
try {
Set<String> collections = new HashSet<>();
@@ -346,6 +351,9 @@ public class SimClusterStateProvider implements ClusterStateProvider {
if (replicas != null) {
replicas.forEach(r -> {
r.getVariables().put(ZkStateReader.STATE_PROP, state.toString());
+ if (state != Replica.State.ACTIVE) {
+ r.getVariables().remove(ZkStateReader.LEADER_PROP);
+ }
changedCollections.add(r.getCollection());
});
}
@@ -433,6 +441,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
* @param runLeaderElection if true then run a leader election after adding the replica.
*/
public void simAddReplica(String nodeId, ReplicaInfo replicaInfo, boolean runLeaderElection) throws Exception {
+ ensureNotClosed();
// make sure SolrCore name is unique across cluster and coreNodeName within collection
for (Map.Entry<String, List<ReplicaInfo>> e : nodeReplicaMap.entrySet()) {
for (ReplicaInfo ri : e.getValue()) {
@@ -468,8 +477,6 @@ public class SimClusterStateProvider implements ClusterStateProvider {
opDelay(replicaInfo.getCollection(), CollectionParams.CollectionAction.ADDREPLICA.name());
- // at this point nuke our cached DocCollection state
- collectionsStatesRef.set(null);
List<ReplicaInfo> replicas = nodeReplicaMap.computeIfAbsent(nodeId, n -> new ArrayList<>());
// mark replica as active
replicaInfo.getVariables().put(ZkStateReader.STATE_PROP, Replica.State.ACTIVE.toString());
@@ -481,7 +488,6 @@ public class SimClusterStateProvider implements ClusterStateProvider {
}
replicas.add(replicaInfo);
- LOG.trace("-- simAddReplica {}", replicaInfo);
Map<String, Object> values = cloudManager.getSimNodeStateProvider().simGetAllNodeValues()
.computeIfAbsent(nodeId, id -> new ConcurrentHashMap<>(SimCloudManager.createNodeValues(id)));
@@ -505,6 +511,9 @@ public class SimClusterStateProvider implements ClusterStateProvider {
cloudManager.getMetricManager().registerGauge(null, registry,
() -> replicaInfo.getVariable(Type.CORE_IDX.metricsAttribute),
"", true, "INDEX.sizeInBytes");
+ // at this point nuke our cached DocCollection state
+ collectionsStatesRef.set(null);
+ LOG.trace("-- simAddReplica {}", replicaInfo);
if (runLeaderElection) {
simRunLeaderElection(Collections.singleton(replicaInfo.getCollection()), true);
}
@@ -519,6 +528,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
* @param coreNodeName coreNodeName
*/
public void simRemoveReplica(String nodeId, String coreNodeName) throws Exception {
+ ensureNotClosed();
lock.lockInterruptibly();
List<ReplicaInfo> replicas = nodeReplicaMap.computeIfAbsent(nodeId, n -> new ArrayList<>());
try {
@@ -591,20 +601,30 @@ public class SimClusterStateProvider implements ClusterStateProvider {
* @param saveClusterState if true then save cluster state regardless of changes.
*/
private void simRunLeaderElection(Collection<String> collections, boolean saveClusterState) throws Exception {
- ClusterState state = getClusterState();
+ ensureNotClosed();
if (saveClusterState) {
collectionsStatesRef.set(null);
}
+ ClusterState state = getClusterState();
state.forEachCollection(dc -> {
if (!collections.contains(dc.getName())) {
return;
}
- dc.getSlices().forEach(s ->
+ dc.getSlices().forEach(s -> {
+ if (s.getLeader() != null) {
+ LOG.debug("-- already has leader {} / {}", dc.getName(), s.getName());
+ return;
+ }
+ if (s.getReplicas().isEmpty()) {
+ LOG.debug("-- no replicas in {} / {}", dc.getName(), s.getName());
+ return;
+ }
+ LOG.debug("-- submit leader election for {} / {}", dc.getName(), s.getName());
cloudManager.submit(() -> {
simRunLeaderElection(dc.getName(), s, saveClusterState);
return true;
- })
- );
+ });
+ });
});
}
@@ -612,59 +632,73 @@ public class SimClusterStateProvider implements ClusterStateProvider {
AtomicBoolean stateChanged = new AtomicBoolean(Boolean.FALSE);
Replica leader = s.getLeader();
if (leader == null || !liveNodes.contains(leader.getNodeName())) {
- LOG.trace("Running leader election for " + collection + " / " + s.getName());
+ LOG.debug("Running leader election for {} / {}", collection, s.getName());
if (s.getReplicas().isEmpty()) { // no replicas - punt
+ LOG.debug("-- no replicas in {} / {}", collection, s.getName());
return;
}
ActionThrottle lt = getThrottle(collection, s.getName());
- lt.minimumWaitBetweenActions();
- lt.markAttemptingAction();
-
- // mark all replicas as non-leader (probably not necessary) and collect all active and live
- List<ReplicaInfo> active = new ArrayList<>();
- s.getReplicas().forEach(r -> {
- // find our ReplicaInfo for this replica
- ReplicaInfo ri = getReplicaInfo(r);
- if (ri == null) {
- throw new IllegalStateException("-- could not find ReplicaInfo for replica " + r);
- }
- synchronized (ri) {
- if (ri.getVariables().remove(ZkStateReader.LEADER_PROP) != null) {
- stateChanged.set(true);
+ synchronized (lt) {
+ // collect all active and live
+ List<ReplicaInfo> active = new ArrayList<>();
+ AtomicBoolean alreadyHasLeader = new AtomicBoolean(false);
+ s.getReplicas().forEach(r -> {
+ // find our ReplicaInfo for this replica
+ ReplicaInfo ri = getReplicaInfo(r);
+ if (ri == null) {
+ throw new IllegalStateException("-- could not find ReplicaInfo for replica " + r);
}
- if (r.isActive(liveNodes.get())) {
- active.add(ri);
- } else { // if it's on a node that is not live mark it down
- if (!liveNodes.contains(r.getNodeName())) {
- ri.getVariables().put(ZkStateReader.STATE_PROP, Replica.State.DOWN.toString());
- stateChanged.set(true);
+ synchronized (ri) {
+ if (r.isActive(liveNodes.get())) {
+ if (ri.getVariables().get(ZkStateReader.LEADER_PROP) != null) {
+ LOG.trace("-- found existing leader {} / {}: {}, {}", collection, s.getName(), ri, r);
+ alreadyHasLeader.set(true);
+ return;
+ } else {
+ active.add(ri);
+ }
+ } else { // if it's on a node that is not live mark it down
+ LOG.trace("-- replica not active on live nodes: {}, {}", liveNodes.get(), r);
+ if (!liveNodes.contains(r.getNodeName())) {
+ ri.getVariables().put(ZkStateReader.STATE_PROP, Replica.State.DOWN.toString());
+ ri.getVariables().remove(ZkStateReader.LEADER_PROP);
+ stateChanged.set(true);
+ }
}
}
+ });
+ if (alreadyHasLeader.get()) {
+ LOG.debug("-- already has leader {} / {}: {}", collection, s.getName(), s);
+ return;
}
- });
- if (active.isEmpty()) {
- LOG.warn("-- can't find any active replicas for " + collection + " / " + s.getName());
- return;
- }
- // pick first active one
- ReplicaInfo ri = null;
- for (ReplicaInfo a : active) {
- if (!a.getType().equals(Replica.Type.PULL)) {
- ri = a;
- break;
+ if (active.isEmpty()) {
+ LOG.warn("Can't find any active replicas for {} / {}: {}", collection, s.getName(), s);
+ LOG.debug("-- liveNodes: {}", liveNodes.get());
+ return;
}
+ // pick first active one
+ ReplicaInfo ri = null;
+ for (ReplicaInfo a : active) {
+ if (!a.getType().equals(Replica.Type.PULL)) {
+ ri = a;
+ break;
+ }
+ }
+ if (ri == null) {
+ LOG.warn("-- can't find any suitable replica type for {} / {}: {}", collection, s.getName(), s);
+ return;
+ }
+ // now mark the leader election throttle
+ lt.minimumWaitBetweenActions();
+ lt.markAttemptingAction();
+ synchronized (ri) {
+ ri.getVariables().put(ZkStateReader.LEADER_PROP, "true");
+ }
+ stateChanged.set(true);
+ LOG.debug("-- elected new leader for " + collection + " / " + s.getName() + ": " + ri.getName());
}
- if (ri == null) {
- LOG.warn("-- can't find any suitable replica type for " + collection + " / " + s.getName());
- return;
- }
- synchronized (ri) {
- ri.getVariables().put(ZkStateReader.LEADER_PROP, "true");
- }
- stateChanged.set(true);
- LOG.debug("-- elected new leader for " + collection + " / " + s.getName() + ": " + ri);
} else {
- LOG.trace("-- already has leader for {} / {}", collection, s.getName());
+ LOG.debug("-- already has leader for {} / {}", collection, s.getName());
}
if (stateChanged.get() || saveState) {
collectionsStatesRef.set(null);
@@ -678,6 +712,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
* @param results results of the operation.
*/
public void simCreateCollection(ZkNodeProps props, NamedList results) throws Exception {
+ ensureNotClosed();
if (props.getStr(CommonAdminParams.ASYNC) != null) {
results.add(CoreAdminParams.REQUESTID, props.getStr(CommonAdminParams.ASYNC));
}
@@ -781,7 +816,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
ReplicaInfo ri = new ReplicaInfo("core_node" + Assign.incAndGetId(stateManager, collectionName, 0),
coreName, collectionName, pos.shard, pos.type, pos.node, replicaProps);
cloudManager.submit(() -> {
- simAddReplica(pos.node, ri, false);
+ simAddReplica(pos.node, ri, true);
finalStateLatch.countDown();
return true;
});
@@ -814,7 +849,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
// force recreation of collection states
collectionsStatesRef.set(null);
- simRunLeaderElection(Collections.singleton(collectionName), true);
+ //simRunLeaderElection(Collections.singleton(collectionName), true);
if (waitForFinalState) {
boolean finished = finalStateLatch.await(cloudManager.getTimeSource().convertDelay(TimeUnit.SECONDS, 60, TimeUnit.MILLISECONDS),
TimeUnit.MILLISECONDS);
@@ -833,6 +868,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
* @param results results of the operation
*/
public void simDeleteCollection(String collection, String async, NamedList results) throws Exception {
+ ensureNotClosed();
if (async != null) {
results.add(CoreAdminParams.REQUESTID, async);
}
@@ -903,6 +939,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
* @param results operation results.
*/
public void simMoveReplica(ZkNodeProps message, NamedList results) throws Exception {
+ ensureNotClosed();
if (message.getStr(CommonAdminParams.ASYNC) != null) {
results.add(CoreAdminParams.REQUESTID, message.getStr(CommonAdminParams.ASYNC));
}
@@ -950,6 +987,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
* @param results operation results
*/
public void simCreateShard(ZkNodeProps message, NamedList results) throws Exception {
+ ensureNotClosed();
if (message.getStr(CommonAdminParams.ASYNC) != null) {
results.add(CoreAdminParams.REQUESTID, message.getStr(CommonAdminParams.ASYNC));
}
@@ -1017,6 +1055,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
* @param results operation results.
*/
public void simSplitShard(ZkNodeProps message, NamedList results) throws Exception {
+ ensureNotClosed();
if (message.getStr(CommonAdminParams.ASYNC) != null) {
results.add(CoreAdminParams.REQUESTID, message.getStr(CommonAdminParams.ASYNC));
}
@@ -1130,6 +1169,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
* @param results operation results
*/
public void simDeleteShard(ZkNodeProps message, NamedList results) throws Exception {
+ ensureNotClosed();
if (message.getStr(CommonAdminParams.ASYNC) != null) {
results.add(CoreAdminParams.REQUESTID, message.getStr(CommonAdminParams.ASYNC));
}
@@ -1180,6 +1220,8 @@ public class SimClusterStateProvider implements ClusterStateProvider {
CommonAdminParams.WAIT_FOR_FINAL_STATE, "true"
);
simCreateCollection(props, new NamedList());
+ CloudTestUtils.waitForState(cloudManager, CollectionAdminParams.SYSTEM_COLL, 20, TimeUnit.SECONDS,
+ CloudTestUtils.clusterShape(1, 1, false, true));
} catch (Exception e) {
throw new IOException(e);
}
@@ -1204,6 +1246,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
* @throws SolrException on errors, such as nonexistent collection or unsupported deleteByQuery
*/
public UpdateResponse simUpdate(UpdateRequest req) throws SolrException, InterruptedException, IOException {
+ ensureNotClosed();
String collection = req.getCollection();
if (collection == null) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Collection not set");
@@ -1699,6 +1742,17 @@ public class SimClusterStateProvider implements ClusterStateProvider {
@Override
public void close() throws IOException {
+ closed = true;
+ }
+ @Override
+ public boolean isClosed() {
+ return closed;
+ }
+
+ private void ensureNotClosed() throws IOException {
+ if (closed) {
+ throw new IOException("already closed");
+ }
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java
index cb8640c..7a346ea 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java
@@ -30,6 +30,8 @@ import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.ReentrantLock;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.solr.client.solrj.cloud.NodeStateProvider;
@@ -230,6 +232,7 @@ public class SimNodeStateProvider implements NodeStateProvider {
}
}
+ private static final Pattern REGISTRY_PATTERN = Pattern.compile("^solr\\.core\\.([\\w.-_]+?)\\.(shard[\\d_]+?)\\.(replica.*)");
/**
* Simulate getting replica metrics values. This uses per-replica properties set in
* {@link SimClusterStateProvider#simSetCollectionValue(String, String, Object, boolean, boolean)} and
@@ -257,14 +260,15 @@ public class SimNodeStateProvider implements NodeStateProvider {
// skip - this is probably solr.node or solr.jvm metric
continue;
}
- String[] collParts = parts[1].substring(10).split("\\.");
- if (collParts.length != 3) {
+ Matcher m = REGISTRY_PATTERN.matcher(parts[1]);
+
+ if (!m.matches()) {
LOG.warn("Invalid registry name: " + parts[1]);
continue;
}
- String collection = collParts[0];
- String shard = collParts[1];
- String replica = collParts[2];
+ String collection = m.group(1);
+ String shard = m.group(2);
+ String replica = m.group(3);
String key = parts.length > 3 ? parts[2] + ":" + parts[3] : parts[2];
replicas.forEach(r -> {
if (r.getCollection().equals(collection) && r.getShard().equals(shard) && r.getCore().endsWith(replica)) {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestComputePlanAction.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestComputePlanAction.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestComputePlanAction.java
index 9edf0ed..c1f3a56 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestComputePlanAction.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestComputePlanAction.java
@@ -147,7 +147,7 @@ public class TestComputePlanAction extends SimSolrCloudTestCase {
create.process(solrClient);
CloudTestUtils.waitForState(cluster, "Timed out waiting for replicas of new collection to be active",
- "testNodeLost", CloudTestUtils.clusterShape(1, 2));
+ "testNodeLost", CloudTestUtils.clusterShape(1, 2, false, true));
ClusterState clusterState = cluster.getClusterStateProvider().getClusterState();
log.debug("-- cluster state: {}", clusterState);
@@ -210,7 +210,7 @@ public class TestComputePlanAction extends SimSolrCloudTestCase {
create.process(solrClient);
CloudTestUtils.waitForState(cluster, "Timed out waiting for replicas of new collection to be active",
- "testNodeWithMultipleReplicasLost", CloudTestUtils.clusterShape(2, 3));
+ "testNodeWithMultipleReplicasLost", CloudTestUtils.clusterShape(2, 3, false, true));
ClusterState clusterState = cluster.getClusterStateProvider().getClusterState();
log.debug("-- cluster state: {}", clusterState);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestExecutePlanAction.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestExecutePlanAction.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestExecutePlanAction.java
index 36d72b8..796dc3b 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestExecutePlanAction.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestExecutePlanAction.java
@@ -90,7 +90,7 @@ public class TestExecutePlanAction extends SimSolrCloudTestCase {
create.process(solrClient);
log.info("Collection ready after " + CloudTestUtils.waitForState(cluster, collectionName, 120, TimeUnit.SECONDS,
- CloudTestUtils.clusterShape(1, 2)) + "ms");
+ CloudTestUtils.clusterShape(1, 2, false, true)) + "ms");
String sourceNodeName = cluster.getSimClusterStateProvider().simGetRandomNode(random());
ClusterState clusterState = cluster.getClusterStateProvider().getClusterState();
@@ -152,7 +152,7 @@ public class TestExecutePlanAction extends SimSolrCloudTestCase {
}
log.info("Collection ready after " + CloudTestUtils.waitForState(cluster, collectionName, 300, TimeUnit.SECONDS,
- CloudTestUtils.clusterShape(1, 2)) + "ms");
+ CloudTestUtils.clusterShape(1, 2, false, true)) + "ms");
}
@Test
@@ -179,7 +179,7 @@ public class TestExecutePlanAction extends SimSolrCloudTestCase {
create.process(solrClient);
CloudTestUtils.waitForState(cluster, "Timed out waiting for replicas of new collection to be active",
- collectionName, CloudTestUtils.clusterShape(1, 2));
+ collectionName, CloudTestUtils.clusterShape(1, 2, false, true));
String sourceNodeName = cluster.getSimClusterStateProvider().simGetRandomNode(random());
ClusterState clusterState = cluster.getClusterStateProvider().getClusterState();
@@ -195,8 +195,10 @@ public class TestExecutePlanAction extends SimSolrCloudTestCase {
cluster.simRemoveNode(sourceNodeName, false);
+ cluster.getTimeSource().sleep(3000);
+
CloudTestUtils.waitForState(cluster, "Timed out waiting for replicas of collection to be 2 again",
- collectionName, CloudTestUtils.clusterShape(1, 2));
+ collectionName, CloudTestUtils.clusterShape(1, 2, false, true));
clusterState = cluster.getClusterStateProvider().getClusterState();
docCollection = clusterState.getCollection(collectionName);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java
index 61fedb8..93f92ec 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java
@@ -87,6 +87,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
static Map<String, List<CapturedEvent>> listenerEvents = new ConcurrentHashMap<>();
static AtomicInteger triggerFinishedCount = new AtomicInteger();
static AtomicInteger triggerStartedCount = new AtomicInteger();
+ static CountDownLatch triggerStartedLatch;
static CountDownLatch triggerFinishedLatch;
static int waitForSeconds;
@@ -100,6 +101,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
waitForSeconds = 5;
triggerStartedCount.set(0);
triggerFinishedCount.set(0);
+ triggerStartedLatch = new CountDownLatch(1);
triggerFinishedLatch = new CountDownLatch(1);
listenerEvents.clear();
// disable .scheduled_maintenance and .auto_add_replicas
@@ -108,20 +110,33 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
"}";
SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.POST, suspendTriggerCommand);
SolrClient solrClient = cluster.simGetSolrClient();
- NamedList<Object> response = solrClient.request(req);
- assertEquals(response.get("result").toString(), "success");
+ NamedList<Object> response;
+ try {
+ response = solrClient.request(req);
+ assertEquals(response.get("result").toString(), "success");
+ } catch (Exception e) {
+ if (!e.toString().contains("No trigger exists")) {
+ throw e;
+ }
+ }
suspendTriggerCommand = "{" +
"'suspend-trigger' : {'name' : '.auto_add_replicas'}" +
"}";
req = createAutoScalingRequest(SolrRequest.METHOD.POST, suspendTriggerCommand);
- response = solrClient.request(req);
- assertEquals(response.get("result").toString(), "success");
+ try {
+ response = solrClient.request(req);
+ assertEquals(response.get("result").toString(), "success");
+ } catch (Exception e) {
+ if (!e.toString().contains("No trigger exists")) {
+ throw e;
+ }
+ }
// do this in advance if missing
if (!cluster.getSimClusterStateProvider().simListCollections().contains(CollectionAdminParams.SYSTEM_COLL)) {
cluster.getSimClusterStateProvider().createSystemCollection();
CloudTestUtils.waitForState(cluster, CollectionAdminParams.SYSTEM_COLL, 120, TimeUnit.SECONDS,
- CloudTestUtils.clusterShape(1, 1));
+ CloudTestUtils.clusterShape(1, 1, false, true));
}
}
@@ -150,6 +165,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
public static class StartTriggerAction extends TriggerActionBase {
@Override
public void process(TriggerEvent event, ActionContext context) throws Exception {
+ triggerStartedLatch.countDown();
triggerStartedCount.incrementAndGet();
}
}
@@ -212,7 +228,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
create.process(solrClient);
log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 30 * nodes.size(), TimeUnit.SECONDS,
- CloudTestUtils.clusterShape(5, 15)) + "ms");
+ CloudTestUtils.clusterShape(5, 15, false, true)) + "ms");
int KILL_NODES = 8;
// kill off a number of nodes
@@ -221,7 +237,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
}
// should fully recover
log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 90 * KILL_NODES, TimeUnit.SECONDS,
- CloudTestUtils.clusterShape(5, 15)) + "ms");
+ CloudTestUtils.clusterShape(5, 15, false, true)) + "ms");
log.info("OP COUNTS: " + cluster.simGetOpCounts());
long moveReplicaOps = cluster.simGetOpCount(CollectionParams.CollectionAction.MOVEREPLICA.name());
@@ -256,7 +272,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 30 * nodes.size(), TimeUnit.SECONDS,
- CloudTestUtils.clusterShape(5, 15)) + "ms");
+ CloudTestUtils.clusterShape(5, 15, false, true)) + "ms");
long newMoveReplicaOps = cluster.simGetOpCount(CollectionParams.CollectionAction.MOVEREPLICA.name());
log.info("==== Flaky replicas: {}. Additional MOVEREPLICA count: {}", flakyReplicas, (newMoveReplicaOps - moveReplicaOps));
// flaky nodes lead to a number of MOVEREPLICA that is non-zero but lower than the number of flaky replicas
@@ -295,7 +311,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
create.process(solrClient);
log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
- CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3)) + " ms");
+ CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3, false, true)) + " ms");
// start adding nodes
int numAddNode = NUM_NODES / 5;
@@ -305,7 +321,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
cluster.getTimeSource().sleep(5000);
}
// wait until at least one event is generated
- boolean await = triggerFinishedLatch.await(10000 / SPEED, TimeUnit.MILLISECONDS);
+ boolean await = triggerStartedLatch.await(20000 / SPEED, TimeUnit.MILLISECONDS);
assertTrue("trigger did not fire", await);
// wait until started == finished
@@ -340,7 +356,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
assertTrue("no MOVEREPLICA ops?", cluster.simGetOpCount("MOVEREPLICA") > 0);
log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
- CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3)) + " ms");
+ CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3, false, true)) + " ms");
int count = 50;
SolrInputDocument finishedEvent = null;
@@ -471,6 +487,20 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
NamedList<Object> response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
+ String setListenerCommand = "{" +
+ "'set-listener' : " +
+ "{" +
+ "'name' : 'failures'," +
+ "'trigger' : 'node_lost_trigger3'," +
+ "'stage' : ['FAILED']," +
+ "'class' : '" + TestTriggerListener.class.getName() + "'" +
+ "}" +
+ "}";
+ req = createAutoScalingRequest(SolrRequest.METHOD.POST, setListenerCommand);
+ response = solrClient.request(req);
+ assertEquals(response.get("result").toString(), "success");
+
+
// create a collection with 1 replica per node
String collectionName = "testNodeLost";
CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collectionName,
@@ -480,7 +510,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
create.process(solrClient);
log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
- CloudTestUtils.clusterShape(NUM_NODES / 5, NUM_NODES / 10)) + " ms");
+ CloudTestUtils.clusterShape(NUM_NODES / 5, NUM_NODES / 10, false, true)) + " ms");
// start killing nodes
int numNodes = NUM_NODES / 5;
@@ -491,7 +521,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
cluster.simRemoveNode(nodes.get(i), false);
cluster.getTimeSource().sleep(killDelay);
}
- // wait for the trigger to fire at least once
+ // wait for the trigger to fire and complete at least once
boolean await = triggerFinishedLatch.await(20 * waitFor * 1000 / SPEED, TimeUnit.MILLISECONDS);
assertTrue("trigger did not fire within timeout, " +
"waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored,
@@ -513,16 +543,23 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
"waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored,
startedEventPos > -1);
SolrInputDocument startedEvent = systemColl.get(startedEventPos);
- // wait until started == finished
+ // we can expect some failures when target node in MOVEREPLICA has been killed
+ // between when the event processing started and the actual moment of MOVEREPLICA execution
+ // wait until started == (finished + failed)
TimeOut timeOut = new TimeOut(20 * waitFor * NUM_NODES, TimeUnit.SECONDS, cluster.getTimeSource());
while (!timeOut.hasTimedOut()) {
if (triggerStartedCount.get() == triggerFinishedCount.get()) {
break;
}
+ log.debug("started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get() +
+ ", failed=" + listenerEvents.size());
timeOut.sleep(1000);
}
if (timeOut.hasTimedOut()) {
- fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get());
+ if (triggerStartedCount.get() > triggerFinishedCount.get() + listenerEvents.size()) {
+ fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get() +
+ ", failed=" + listenerEvents.size());
+ }
}
int ignored = 0;
int lastIgnoredPos = startedEventPos;
@@ -546,8 +583,13 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
"waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored,
cluster.simGetOpCount("MOVEREPLICA") > 0);
- log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
- CloudTestUtils.clusterShape(NUM_NODES / 5, NUM_NODES / 10)) + " ms");
+ if (listenerEvents.isEmpty()) {
+ // no failed movements - verify collection shape
+ log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
+ CloudTestUtils.clusterShape(NUM_NODES / 5, NUM_NODES / 10, false, true)) + " ms");
+ } else {
+ cluster.getTimeSource().sleep(NUM_NODES * 100);
+ }
int count = 50;
SolrInputDocument finishedEvent = null;
@@ -580,9 +622,13 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
delta = TimeUnit.NANOSECONDS.toMillis(delta);
log.info("#### System stabilized after " + delta + " ms");
long ops = cluster.simGetOpCount("MOVEREPLICA");
- assertTrue("unexpected number of MOVEREPLICA ops: " + ops + ", " +
+ long expectedMinOps = 40;
+ if (!listenerEvents.isEmpty()) {
+ expectedMinOps = 20;
+ }
+ assertTrue("unexpected number (" + expectedMinOps + ") of MOVEREPLICA ops: " + ops + ", " +
"waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored,
- ops >= 40);
+ ops >= expectedMinOps);
return delta;
}
@@ -596,7 +642,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
create.process(solrClient);
log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 300, TimeUnit.SECONDS,
- CloudTestUtils.clusterShape(2, 10)) + " ms");
+ CloudTestUtils.clusterShape(2, 10, false, true)) + " ms");
// collect the node names for shard1
Set<String> nodes = new HashSet<>();
@@ -617,6 +663,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
"'event' : 'searchRate'," +
"'waitFor' : '" + waitForSeconds + "s'," +
"'aboveRate' : 1.0," +
+ "'aboveNodeRate' : 1.0," +
"'enabled' : true," +
"'actions' : [" +
"{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," +
@@ -655,7 +702,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
m.forEach((k, v) -> assertEquals(4.0, v.doubleValue(), 0.01));
List<TriggerEvent.Op> ops = (List<TriggerEvent.Op>)ev.event.getProperty(TriggerEvent.REQUESTED_OPS);
assertNotNull(ops);
- assertEquals(3, ops.size());
+ assertEquals(ops.toString(), 1, ops.size());
ops.forEach(op -> {
assertEquals(CollectionParams.CollectionAction.ADDREPLICA, op.getAction());
assertEquals(1, op.getHints().size());
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8dd704ef/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestPolicyCloud.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestPolicyCloud.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestPolicyCloud.java
index 1adbabb..3237639 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestPolicyCloud.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestPolicyCloud.java
@@ -69,7 +69,7 @@ public class TestPolicyCloud extends SimSolrCloudTestCase {
.process(solrClient);
CloudTestUtils.waitForState(cluster, "Timeout waiting for collection to become active", "perReplicaDataColl",
- CloudTestUtils.clusterShape(1, 5));
+ CloudTestUtils.clusterShape(1, 5, false, true));
DocCollection coll = getCollectionState("perReplicaDataColl");
String autoScaleJson = "{" +
" 'cluster-preferences': [" +
@@ -119,7 +119,7 @@ public class TestPolicyCloud extends SimSolrCloudTestCase {
.setPolicy("c1")
.process(solrClient);
CloudTestUtils.waitForState(cluster, "Timeout waiting for collection to become active", collectionName,
- CloudTestUtils.clusterShape(1, 1));
+ CloudTestUtils.clusterShape(1, 1, false, true));
getCollectionState(collectionName).forEachReplica((s, replica) -> assertEquals(nodeId, replica.getNodeName()));
@@ -152,7 +152,7 @@ public class TestPolicyCloud extends SimSolrCloudTestCase {
.setPolicy("c1")
.process(solrClient);
CloudTestUtils.waitForState(cluster, "Timeout waiting for collection to become active", collectionName,
- CloudTestUtils.clusterShape(1, 2));
+ CloudTestUtils.clusterShape(1, 2, false, true));
DocCollection docCollection = getCollectionState(collectionName);
List<Replica> list = docCollection.getReplicas(firstNode);
@@ -257,7 +257,7 @@ public class TestPolicyCloud extends SimSolrCloudTestCase {
CollectionAdminRequest.createCollectionWithImplicitRouter("policiesTest", "conf", "s1", 1, 1, 1)
.process(solrClient);
CloudTestUtils.waitForState(cluster, "Timeout waiting for collection to become active", "policiesTest",
- CloudTestUtils.clusterShape(1, 3));
+ CloudTestUtils.clusterShape(1, 3, false, true));
DocCollection coll = getCollectionState("policiesTest");
@@ -322,7 +322,7 @@ public class TestPolicyCloud extends SimSolrCloudTestCase {
CollectionAdminRequest.createCollectionWithImplicitRouter("policiesTest", "conf", "shard1", 2)
.process(solrClient);
CloudTestUtils.waitForState(cluster, "Timeout waiting for collection to become active", "policiesTest",
- CloudTestUtils.clusterShape(1, 2));
+ CloudTestUtils.clusterShape(1, 2, false, true));
DocCollection rulesCollection = getCollectionState("policiesTest");
Map<String, Object> val = cluster.getNodeStateProvider().getNodeValues(rulesCollection.getReplicas().get(0).getNodeName(), Arrays.asList(