You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by il...@apache.org on 2020/11/27 13:05:09 UTC
[lucene-solr] branch jira/solr-15004 updated: fix bug of multiple
replicas of same shard placed on same node
This is an automated email from the ASF dual-hosted git repository.
ilan pushed a commit to branch jira/solr-15004
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/jira/solr-15004 by this push:
new 427a203 fix bug of multiple replicas of same shard placed on same node
427a203 is described below
commit 427a203c1d6648c6660fb593959a2554d38c615c
Author: Ilan Ginzburg <ig...@salesforce.com>
AuthorDate: Fri Nov 27 14:04:21 2020 +0100
fix bug of multiple replicas of same shard placed on same node
---
.../plugins/AffinityPlacementFactory.java | 54 +++++++----
.../plugins/AffinityPlacementFactoryTest.java | 105 ++++++++++-----------
2 files changed, 87 insertions(+), 72 deletions(-)
diff --git a/solr/core/src/java/org/apache/solr/cluster/placement/plugins/AffinityPlacementFactory.java b/solr/core/src/java/org/apache/solr/cluster/placement/plugins/AffinityPlacementFactory.java
index 16c7b41..affd627 100644
--- a/solr/core/src/java/org/apache/solr/cluster/placement/plugins/AffinityPlacementFactory.java
+++ b/solr/core/src/java/org/apache/solr/cluster/placement/plugins/AffinityPlacementFactory.java
@@ -227,13 +227,25 @@ public class AffinityPlacementFactory implements PlacementPluginFactory {
// Let's now iterate on all shards to create replicas for and start finding home sweet homes for the replicas
for (String shardName : request.getShardNames()) {
+ // Inventory nodes (if any) that already have a replica of any type for the shard, because we can't be placing
+ // additional replicas on these. This data structure is updated after each replica to node assign and is used to
+ // make sure different replica types are not allocated to the same nodes (protecting same node assignments within
+ // a given replica type is done "by construction" in makePlacementDecisions()).
+ Set<Node> nodesWithReplicas = new HashSet<>();
+ Shard shard = solrCollection.getShard(shardName);
+ if (shard != null) {
+ for (Replica r : shard.replicas()) {
+ nodesWithReplicas.add(r.getNode());
+ }
+ }
+
// Iterate on the replica types in the enum order. We place more strategic replicas first
// (NRT is more strategic than TLOG more strategic than PULL). This is in case we eventually decide that less
// strategic replica placement impossibility is not a problem that should lead to replica placement computation
// failure. Current code does fail if placement is impossible (constraint is at most one replica of a shard on any node).
for (Replica.ReplicaType replicaType : Replica.ReplicaType.values()) {
makePlacementDecisions(solrCollection, shardName, availabilityZones, replicaType, request.getCountReplicasToCreate(replicaType),
- attrValues, replicaTypeToNodes, coresOnNodes, placementPlanFactory, replicaPlacements);
+ attrValues, replicaTypeToNodes, nodesWithReplicas, coresOnNodes, placementPlanFactory, replicaPlacements);
}
}
@@ -353,44 +365,44 @@ public class AffinityPlacementFactory implements PlacementPluginFactory {
* <p>The criteria used in this method are, in this order:
* <ol>
* <li>No more than one replica of a given shard on a given node (strictly enforced)</li>
- * <li>Balance as much as possible the number of replicas of the given {@link org.apache.solr.cluster.Replica.ReplicaType} over available AZ's.
+ * <li>Balance as much as possible replicas of a given {@link org.apache.solr.cluster.Replica.ReplicaType} over available AZ's.
* This balancing takes into account existing replicas <b>of the corresponding replica type</b>, if any.</li>
- * <li>Place replicas is possible on nodes having more than a certain amount of free disk space (note that nodes with a too small
+ * <li>Place replicas if possible on nodes having more than a certain amount of free disk space (note that nodes with a too small
* amount of free disk space were eliminated as placement targets earlier, in {@link #getNodesPerReplicaType}). There's
* a threshold here rather than sorting on the amount of free disk space, because sorting on that value would in
* practice lead to never considering the number of cores on a node.</li>
* <li>Place replicas on nodes having a smaller number of cores (the number of cores considered
- * for this decision includes decisions made during the processing of the placement request)</li>
+ * for this decision includes previous placement decisions made during the processing of the placement request)</li>
* </ol>
*/
@SuppressForbidden(reason = "Ordering.arbitrary() has no equivalent in Comparator class. Rather reuse than copy.")
private void makePlacementDecisions(SolrCollection solrCollection, String shardName, Set<String> availabilityZones,
Replica.ReplicaType replicaType, int numReplicas, final AttributeValues attrValues,
- EnumMap<Replica.ReplicaType, Set<Node>> replicaTypeToNodes, Map<Node, Integer> coresOnNodes,
- PlacementPlanFactory placementPlanFactory, Set<ReplicaPlacement> replicaPlacements) throws PlacementException {
- // Build the set of candidate nodes, i.e. nodes not having (yet) a replica of the given shard
- Set<Node> candidateNodes = new HashSet<>(replicaTypeToNodes.get(replicaType));
-
- // Count existing replicas per AZ. We count only instances of the type of replica for which we need to do placement. This
- // can be changed in the loop below if we want to count all replicas for the shard.
+ EnumMap<Replica.ReplicaType, Set<Node>> replicaTypeToNodes, Set<Node> nodesWithReplicas,
+ Map<Node, Integer> coresOnNodes, PlacementPlanFactory placementPlanFactory,
+ Set<ReplicaPlacement> replicaPlacements) throws PlacementException {
+ // Count existing replicas per AZ. We count only instances of the type of replica for which we need to do placement.
+ // If we ever want to balance replicas of any type across AZ's (and not each replica type balanced independently),
+ // we'd have to move this data structure to the caller of this method so it can be reused across different replica
+ // type placements for a given shard. Note then that this change would be risky. For example all NRT's and PULL
+ // replicas for a shard my be correctly balanced over three AZ's, but then all NRT can end up in the same AZ...
Map<String, Integer> azToNumReplicas = new HashMap<>();
- // Add all "interesting" AZ's, i.e. AZ's for which there's a chance we can do placement.
for (String az : availabilityZones) {
azToNumReplicas.put(az, 0);
}
+ // Build the set of candidate nodes for the placement, i.e. nodes that can accept the replica type
+ Set<Node> candidateNodes = new HashSet<>(replicaTypeToNodes.get(replicaType));
+ // Remove nodes that already have a replica for the shard (no two replicas of same shard can be put on same node)
+ candidateNodes.removeAll(nodesWithReplicas);
+
Shard shard = solrCollection.getShard(shardName);
if (shard != null) {
// shard is non null if we're adding replicas to an already existing collection.
// If we're creating the collection, the shards do not exist yet.
for (Replica replica : shard.replicas()) {
- // Nodes already having any type of replica for the shard can't get another replica.
- candidateNodes.remove(replica.getNode());
- // The node's AZ has to be counted as having a replica if it has a replica of the same type as the one we need
- // to place here (remove the "if" below to balance the number of replicas per AZ across all replica types rather
- // than within each replica type, but then there's a risk that all NRT replicas for example end up on the same AZ).
- // Note that if in the cluster nodes are configured to accept a single replica type and not multiple ones, the
- // two options are equivalent (governed by system property AVAILABILITY_ZONE_SYSPROP on each node)
+ // The node's AZ is counted as having a replica if it has a replica of the same type as the one we need
+ // to place here.
if (replica.getType() == replicaType) {
final String az = getNodeAZ(replica.getNode(), attrValues);
if (azToNumReplicas.containsKey(az)) {
@@ -477,6 +489,10 @@ public class AffinityPlacementFactory implements PlacementPluginFactory {
Node assignTarget = nodes.remove(0);
+ // Do not assign that node again for replicas of other replica type for this shard
+ // (this update of the set is not useful in the current execution of this method but for following ones only)
+ nodesWithReplicas.add(assignTarget);
+
// Insert back a corrected entry for the AZ: one more replica living there and one less node that can accept new replicas
// (the remaining candidate node list might be empty, in which case it will be cleaned up on the next iteration).
azByExistingReplicas.put(azWithNodesEntry.getKey() + 1, azWithNodes);
diff --git a/solr/core/src/test/org/apache/solr/cluster/placement/plugins/AffinityPlacementFactoryTest.java b/solr/core/src/test/org/apache/solr/cluster/placement/plugins/AffinityPlacementFactoryTest.java
index 8d8937a..095b0e9 100644
--- a/solr/core/src/test/org/apache/solr/cluster/placement/plugins/AffinityPlacementFactoryTest.java
+++ b/solr/core/src/test/org/apache/solr/cluster/placement/plugins/AffinityPlacementFactoryTest.java
@@ -122,14 +122,18 @@ public class AffinityPlacementFactoryTest extends SolrTestCaseJ4 {
final int LOW_SPACE_NODE_INDEX = 0;
final int NO_SPACE_NODE_INDEX = 1;
-
// Cluster nodes and their attributes
- Builders.ClusterBuilder clusterBuilder = Builders.newClusterBuilder().initializeNodes(4);
+ Builders.ClusterBuilder clusterBuilder = Builders.newClusterBuilder().initializeNodes(8);
LinkedList<Builders.NodeBuilder> nodeBuilders = clusterBuilder.getNodeBuilders();
- nodeBuilders.get(LOW_SPACE_NODE_INDEX).setCoreCount(1).setFreeDiskGB(MINIMAL_FREE_DISK_GB + 1); // Low space
- nodeBuilders.get(NO_SPACE_NODE_INDEX).setCoreCount(10).setFreeDiskGB(1L); // Really not enough space
- nodeBuilders.get(2).setCoreCount(10).setFreeDiskGB(PRIORITIZED_FREE_DISK_GB + 1);
- nodeBuilders.get(3).setCoreCount(10).setFreeDiskGB(PRIORITIZED_FREE_DISK_GB + 1);
+ for (int i = 0; i < nodeBuilders.size(); i++) {
+ if (i == LOW_SPACE_NODE_INDEX) {
+ nodeBuilders.get(i).setCoreCount(1).setFreeDiskGB(MINIMAL_FREE_DISK_GB + 1); // Low space
+ } else if (i == NO_SPACE_NODE_INDEX) {
+ nodeBuilders.get(i).setCoreCount(10).setFreeDiskGB(1L); // Really not enough space
+ } else {
+ nodeBuilders.get(i).setCoreCount(10).setFreeDiskGB(PRIORITIZED_FREE_DISK_GB + 1);
+ }
+ }
List<Node> liveNodes = clusterBuilder.buildLiveNodes();
// The collection to create (shards are defined but no replicas)
@@ -144,36 +148,31 @@ public class AffinityPlacementFactoryTest extends SolrTestCaseJ4 {
PlacementPlan pp = plugin.computePlacement(clusterBuilder.build(), placementRequest, clusterBuilder.buildAttributeFetcher(), new PlacementPlanFactoryImpl());
assertEquals(18, pp.getReplicaPlacements().size()); // 3 shards, 6 replicas total each
- // Verify no two replicas of same type of same shard placed on same node
- Set<Pair<Pair<Replica.ReplicaType, String>, Node>> placements = new HashSet<>();
+ Set<Pair<String, Node>> placements = new HashSet<>();
for (ReplicaPlacement rp : pp.getReplicaPlacements()) {
- assertTrue("two replicas of same type for same shard placed on same node",
- placements.add(new Pair<>(new Pair<>(rp.getReplicaType(), rp.getShardName()), rp.getNode())));
+ assertTrue("two replicas for same shard placed on same node", placements.add(new Pair<>(rp.getShardName(), rp.getNode())));
assertNotEquals("Replica unnecessarily placed on node with low free space", rp.getNode(), liveNodes.get(LOW_SPACE_NODE_INDEX));
assertNotEquals("Replica placed on node with not enough free space", rp.getNode(), liveNodes.get(NO_SPACE_NODE_INDEX));
}
- // Verify that if we ask for 3 replicas, the placement will use the low free space node
- // Place two replicas of each type for each shard
+ // Verify that if we ask for 7 replicas, the placement will use the low free space node
placementRequest = new PlacementRequestImpl(solrCollection, solrCollection.getShardNames(), new HashSet<>(liveNodes),
- 3, 0, 0);
+ 7, 0, 0);
pp = plugin.computePlacement(clusterBuilder.build(), placementRequest, clusterBuilder.buildAttributeFetcher(), new PlacementPlanFactoryImpl());
- assertEquals(9, pp.getReplicaPlacements().size()); // 3 shards, 3 replicas each
- // Verify no two replicas of same shard placed on same node
+ assertEquals(21, pp.getReplicaPlacements().size()); // 3 shards, 7 replicas each
placements = new HashSet<>();
for (ReplicaPlacement rp : pp.getReplicaPlacements()) {
assertEquals("Only NRT replicas should be created", Replica.ReplicaType.NRT, rp.getReplicaType());
- assertTrue("two replicas for same shard placed on same node",
- placements.add(new Pair<>(new Pair<>(rp.getReplicaType(), rp.getShardName()), rp.getNode())));
+ assertTrue("two replicas for same shard placed on same node", placements.add(new Pair<>(rp.getShardName(), rp.getNode())));
assertNotEquals("Replica placed on node with not enough free space", rp.getNode(), liveNodes.get(NO_SPACE_NODE_INDEX));
}
- // Verify that if we ask for 4 replicas, the placement will fail
+ // Verify that if we ask for 8 replicas, the placement fails
try {
placementRequest = new PlacementRequestImpl(solrCollection, solrCollection.getShardNames(), new HashSet<>(liveNodes),
- 4, 0, 0);
+ 8, 0, 0);
plugin.computePlacement(clusterBuilder.build(), placementRequest, clusterBuilder.buildAttributeFetcher(), new PlacementPlanFactoryImpl());
- fail("Placing 4 replicas should not be possible given only three nodes have enough space");
+ fail("Placing 8 replicas should not be possible given only 7 nodes have enough space");
} catch (PlacementException e) {
// expected
}
@@ -192,7 +191,7 @@ public class AffinityPlacementFactoryTest extends SolrTestCaseJ4 {
// Cluster nodes and their attributes
- Builders.ClusterBuilder clusterBuilder = Builders.newClusterBuilder().initializeNodes(4);
+ Builders.ClusterBuilder clusterBuilder = Builders.newClusterBuilder().initializeNodes(5);
LinkedList<Builders.NodeBuilder> nodeBuilders = clusterBuilder.getNodeBuilders();
int coresOnNode = 10;
for (Builders.NodeBuilder nodeBuilder : nodeBuilders) {
@@ -205,20 +204,20 @@ public class AffinityPlacementFactoryTest extends SolrTestCaseJ4 {
// Build the collection letting the code pick up nodes...
collectionBuilder.initializeShardsReplicas(2, 2, 1, 0, nodeBuilders);
// Now explicitly change the nodes to create the collection distribution we want to test:
- // (note this collection is an illegal placement: shard 1 has two replicas on node 0. The placement plugin should still
- // be able to place new replicas as long as they don't break the rules).
- // +--------------+----+----+----+----+
- // | Node | 0 | 1 | 2 | 3 |
- // |Cores on node | 10 | 20 | 30 | 40 |
- // +----------------------------------+
- // | Shard 1: | | | | |
- // | NRT | X | | | X |
- // | TLOG | X | | | |
- // +----------------------------------+
- // | Shard 2: | | | | |
- // | NRT | | X | | X |
- // | TLOG | | | X | |
- // +--------------+----+----+----+----+
+ // (note this collection is in an illegal state: shard 1 has two replicas on node 0. The placement plugin would NOT
+ // generate such a placement but should still be able to place additional replicas as long as THEY don't break the rules).
+ // +--------------+----+----+----+----+----+
+ // | Node | 0 | 1 | 2 | 3 | 4 |
+ // |Cores on node | 10 | 20 | 30 | 40 | 50 |
+ // +----------------------------------+----+
+ // | Shard 1: | | | | | |
+ // | NRT | X | | | X | |
+ // | TLOG | X | | | | |
+ // +----------------------------------+----+
+ // | Shard 2: | | | | | |
+ // | NRT | | X | | X | |
+ // | TLOG | | | X | | |
+ // +--------------+----+----+----+----+----+
// The code below is not ideal... We only modify the parts of the collection that we need to change (replica nodes).
// If this only happens in this test then it is likely the simplest approach.
@@ -274,18 +273,18 @@ public class AffinityPlacementFactoryTest extends SolrTestCaseJ4 {
// i.e. those that do not already have a replica for the shard and then on the node with the lowest
// number of cores. NRT are placed first.
// We therefore expect the placement of the new replicas to look like:
- // +--------------+----+----+----+----+
- // | Node | 0 | 1 | 2 | 3 |
- // |Cores on node | 10 | 20 | 30 | 40 |
- // +----------------------------------+
- // | Shard 1: | | | | |
- // | NRT | X | N | | X |
- // | TLOG | X | | N | |
- // +----------------------------------+
- // | Shard 2: | | | | |
- // | NRT | N | X | | X |
- // | TLOG | | N | X | | <-- We don't really expect this. It should be impossible to place this TLOG with 4 nodes
- // +--------------+----+----+----+----+
+ // +--------------+----+----+----+----+----+
+ // | Node | 0 | 1 | 2 | 3 | 4 |
+ // |Cores on node | 10 | 20 | 30 | 40 | 50 |
+ // +----------------------------------+----+
+ // | Shard 1: | | | | | |
+ // | NRT | X | N | | X | |
+ // | TLOG | X | | N | | |
+ // +----------------------------------+----+
+ // | Shard 2: | | | | | |
+ // | NRT | N | X | | X | |
+ // | TLOG | | | X | | N |
+ // +--------------+----+----+----+----+----+
// Place two replicas of each type for each shard
@@ -298,7 +297,7 @@ public class AffinityPlacementFactoryTest extends SolrTestCaseJ4 {
Set<ReplicaPlacement> replicaPlacements = pp.getReplicaPlacements();
// Each expected placement is represented as a string "shard replica-type node"
- Set<String> expectedPlacements = Set.of("1 NRT 1", "1 TLOG 2", "2 NRT 0", "2 TLOG 1");
+ Set<String> expectedPlacements = Set.of("1 NRT 1", "1 TLOG 2", "2 NRT 0", "2 TLOG 4");
verifyPlacements(expectedPlacements, replicaPlacements, shardBuilders, liveNodes);
}
@@ -398,11 +397,11 @@ public class AffinityPlacementFactoryTest extends SolrTestCaseJ4 {
Builders.NodeBuilder nodeBuilder = clusterBuilder.getNodeBuilders().get(i);
nodeBuilder.setCoreCount(0);
nodeBuilder.setFreeDiskGB(100L);
- if (i < NUM_NODES / 2) {
- nodeBuilder.setSysprop(AffinityPlacementFactory.REPLICA_TYPE_SYSPROP, "Nrt,Tlog");
+ if (i < NUM_NODES / 3 * 2) {
+ nodeBuilder.setSysprop(AffinityPlacementFactory.REPLICA_TYPE_SYSPROP, "Nrt, TlOg");
nodeBuilder.setSysprop("group", "one");
} else {
- nodeBuilder.setSysprop(AffinityPlacementFactory.REPLICA_TYPE_SYSPROP, "Pull, foobar");
+ nodeBuilder.setSysprop(AffinityPlacementFactory.REPLICA_TYPE_SYSPROP, "Pull,foobar");
nodeBuilder.setSysprop("group", "two");
}
}
@@ -485,12 +484,12 @@ public class AffinityPlacementFactoryTest extends SolrTestCaseJ4 {
PlacementRequestImpl placementRequest = new PlacementRequestImpl(solrCollection,
StreamSupport.stream(solrCollection.shards().spliterator(), false)
.map(Shard::getShardName).collect(Collectors.toSet()),
- cluster.getLiveNodes(), 2, 0, 2);
+ cluster.getLiveNodes(), 1, 0, 1);
PlacementPlanFactory placementPlanFactory = new PlacementPlanFactoryImpl();
AttributeFetcher attributeFetcher = clusterBuilder.buildAttributeFetcher();
PlacementPlan pp = plugin.computePlacement(cluster, placementRequest, attributeFetcher, placementPlanFactory);
- assertEquals(8, pp.getReplicaPlacements().size());
+ assertEquals(4, pp.getReplicaPlacements().size());
for (ReplicaPlacement rp : pp.getReplicaPlacements()) {
assertFalse("should not put any replicas on " + smallNode, rp.getNode().equals(smallNode));
}