You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by il...@apache.org on 2020/11/27 13:05:09 UTC

[lucene-solr] branch jira/solr-15004 updated: fix bug of multiple replicas of same shard placed on same node

This is an automated email from the ASF dual-hosted git repository.

ilan pushed a commit to branch jira/solr-15004
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/jira/solr-15004 by this push:
     new 427a203  fix bug of multiple replicas of same shard placed on same node
427a203 is described below

commit 427a203c1d6648c6660fb593959a2554d38c615c
Author: Ilan Ginzburg <ig...@salesforce.com>
AuthorDate: Fri Nov 27 14:04:21 2020 +0100

    fix bug of multiple replicas of same shard placed on same node
---
 .../plugins/AffinityPlacementFactory.java          |  54 +++++++----
 .../plugins/AffinityPlacementFactoryTest.java      | 105 ++++++++++-----------
 2 files changed, 87 insertions(+), 72 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cluster/placement/plugins/AffinityPlacementFactory.java b/solr/core/src/java/org/apache/solr/cluster/placement/plugins/AffinityPlacementFactory.java
index 16c7b41..affd627 100644
--- a/solr/core/src/java/org/apache/solr/cluster/placement/plugins/AffinityPlacementFactory.java
+++ b/solr/core/src/java/org/apache/solr/cluster/placement/plugins/AffinityPlacementFactory.java
@@ -227,13 +227,25 @@ public class AffinityPlacementFactory implements PlacementPluginFactory {
 
       // Let's now iterate on all shards to create replicas for and start finding home sweet homes for the replicas
       for (String shardName : request.getShardNames()) {
+        // Inventory nodes (if any) that already have a replica of any type for the shard, because we can't be placing
+        // additional replicas on these. This data structure is updated after each replica to node assign and is used to
+        // make sure different replica types are not allocated to the same nodes (protecting same node assignments within
+        // a given replica type is done "by construction" in makePlacementDecisions()).
+        Set<Node> nodesWithReplicas = new HashSet<>();
+        Shard shard = solrCollection.getShard(shardName);
+        if (shard != null) {
+          for (Replica r : shard.replicas()) {
+            nodesWithReplicas.add(r.getNode());
+          }
+        }
+
         // Iterate on the replica types in the enum order. We place more strategic replicas first
         // (NRT is more strategic than TLOG more strategic than PULL). This is in case we eventually decide that less
         // strategic replica placement impossibility is not a problem that should lead to replica placement computation
         // failure. Current code does fail if placement is impossible (constraint is at most one replica of a shard on any node).
         for (Replica.ReplicaType replicaType : Replica.ReplicaType.values()) {
           makePlacementDecisions(solrCollection, shardName, availabilityZones, replicaType, request.getCountReplicasToCreate(replicaType),
-              attrValues, replicaTypeToNodes, coresOnNodes, placementPlanFactory, replicaPlacements);
+              attrValues, replicaTypeToNodes, nodesWithReplicas, coresOnNodes, placementPlanFactory, replicaPlacements);
         }
       }
 
@@ -353,44 +365,44 @@ public class AffinityPlacementFactory implements PlacementPluginFactory {
      * <p>The criteria used in this method are, in this order:
      * <ol>
      *     <li>No more than one replica of a given shard on a given node (strictly enforced)</li>
-     *     <li>Balance as much as possible the number of replicas of the given {@link org.apache.solr.cluster.Replica.ReplicaType} over available AZ's.
+     *     <li>Balance as much as possible replicas of a given {@link org.apache.solr.cluster.Replica.ReplicaType} over available AZ's.
      *     This balancing takes into account existing replicas <b>of the corresponding replica type</b>, if any.</li>
-     *     <li>Place replicas is possible on nodes having more than a certain amount of free disk space (note that nodes with a too small
+     *     <li>Place replicas if possible on nodes having more than a certain amount of free disk space (note that nodes with a too small
      *     amount of free disk space were eliminated as placement targets earlier, in {@link #getNodesPerReplicaType}). There's
      *     a threshold here rather than sorting on the amount of free disk space, because sorting on that value would in
      *     practice lead to never considering the number of cores on a node.</li>
      *     <li>Place replicas on nodes having a smaller number of cores (the number of cores considered
-     *     for this decision includes decisions made during the processing of the placement request)</li>
+     *     for this decision includes previous placement decisions made during the processing of the placement request)</li>
      * </ol>
      */
     @SuppressForbidden(reason = "Ordering.arbitrary() has no equivalent in Comparator class. Rather reuse than copy.")
     private void makePlacementDecisions(SolrCollection solrCollection, String shardName, Set<String> availabilityZones,
                                         Replica.ReplicaType replicaType, int numReplicas, final AttributeValues attrValues,
-                                        EnumMap<Replica.ReplicaType, Set<Node>> replicaTypeToNodes, Map<Node, Integer> coresOnNodes,
-                                        PlacementPlanFactory placementPlanFactory, Set<ReplicaPlacement> replicaPlacements) throws PlacementException {
-      // Build the set of candidate nodes, i.e. nodes not having (yet) a replica of the given shard
-      Set<Node> candidateNodes = new HashSet<>(replicaTypeToNodes.get(replicaType));
-
-      // Count existing replicas per AZ. We count only instances of the type of replica for which we need to do placement. This
-      // can be changed in the loop below if we want to count all replicas for the shard.
+                                        EnumMap<Replica.ReplicaType, Set<Node>> replicaTypeToNodes, Set<Node> nodesWithReplicas,
+                                        Map<Node, Integer> coresOnNodes, PlacementPlanFactory placementPlanFactory,
+                                        Set<ReplicaPlacement> replicaPlacements) throws PlacementException {
+      // Count existing replicas per AZ. We count only instances of the type of replica for which we need to do placement.
+      // If we ever want to balance replicas of any type across AZ's (and not each replica type balanced independently),
+      // we'd have to move this data structure to the caller of this method so it can be reused across different replica
+      // type placements for a given shard. Note then that this change would be risky. For example all NRT's and PULL
+      // replicas for a shard my be correctly balanced over three AZ's, but then all NRT can end up in the same AZ...
       Map<String, Integer> azToNumReplicas = new HashMap<>();
-      // Add all "interesting" AZ's, i.e. AZ's for which there's a chance we can do placement.
       for (String az : availabilityZones) {
         azToNumReplicas.put(az, 0);
       }
 
+      // Build the set of candidate nodes for the placement, i.e. nodes that can accept the replica type
+      Set<Node> candidateNodes = new HashSet<>(replicaTypeToNodes.get(replicaType));
+      // Remove nodes that already have a replica for the shard (no two replicas of same shard can be put on same node)
+      candidateNodes.removeAll(nodesWithReplicas);
+
       Shard shard = solrCollection.getShard(shardName);
       if (shard != null) {
         // shard is non null if we're adding replicas to an already existing collection.
         // If we're creating the collection, the shards do not exist yet.
         for (Replica replica : shard.replicas()) {
-          // Nodes already having any type of replica for the shard can't get another replica.
-          candidateNodes.remove(replica.getNode());
-          // The node's AZ has to be counted as having a replica if it has a replica of the same type as the one we need
-          // to place here (remove the "if" below to balance the number of replicas per AZ across all replica types rather
-          // than within each replica type, but then there's a risk that all NRT replicas for example end up on the same AZ).
-          // Note that if in the cluster nodes are configured to accept a single replica type and not multiple ones, the
-          // two options are equivalent (governed by system property AVAILABILITY_ZONE_SYSPROP on each node)
+          // The node's AZ is counted as having a replica if it has a replica of the same type as the one we need
+          // to place here.
           if (replica.getType() == replicaType) {
             final String az = getNodeAZ(replica.getNode(), attrValues);
             if (azToNumReplicas.containsKey(az)) {
@@ -477,6 +489,10 @@ public class AffinityPlacementFactory implements PlacementPluginFactory {
 
         Node assignTarget = nodes.remove(0);
 
+        // Do not assign that node again for replicas of other replica type for this shard
+        // (this update of the set is not useful in the current execution of this method but for following ones only)
+        nodesWithReplicas.add(assignTarget);
+
         // Insert back a corrected entry for the AZ: one more replica living there and one less node that can accept new replicas
         // (the remaining candidate node list might be empty, in which case it will be cleaned up on the next iteration).
         azByExistingReplicas.put(azWithNodesEntry.getKey() + 1, azWithNodes);
diff --git a/solr/core/src/test/org/apache/solr/cluster/placement/plugins/AffinityPlacementFactoryTest.java b/solr/core/src/test/org/apache/solr/cluster/placement/plugins/AffinityPlacementFactoryTest.java
index 8d8937a..095b0e9 100644
--- a/solr/core/src/test/org/apache/solr/cluster/placement/plugins/AffinityPlacementFactoryTest.java
+++ b/solr/core/src/test/org/apache/solr/cluster/placement/plugins/AffinityPlacementFactoryTest.java
@@ -122,14 +122,18 @@ public class AffinityPlacementFactoryTest extends SolrTestCaseJ4 {
     final int LOW_SPACE_NODE_INDEX = 0;
     final int NO_SPACE_NODE_INDEX = 1;
 
-
     // Cluster nodes and their attributes
-    Builders.ClusterBuilder clusterBuilder = Builders.newClusterBuilder().initializeNodes(4);
+    Builders.ClusterBuilder clusterBuilder = Builders.newClusterBuilder().initializeNodes(8);
     LinkedList<Builders.NodeBuilder> nodeBuilders = clusterBuilder.getNodeBuilders();
-    nodeBuilders.get(LOW_SPACE_NODE_INDEX).setCoreCount(1).setFreeDiskGB(MINIMAL_FREE_DISK_GB + 1); // Low space
-    nodeBuilders.get(NO_SPACE_NODE_INDEX).setCoreCount(10).setFreeDiskGB(1L); // Really not enough space
-    nodeBuilders.get(2).setCoreCount(10).setFreeDiskGB(PRIORITIZED_FREE_DISK_GB + 1);
-    nodeBuilders.get(3).setCoreCount(10).setFreeDiskGB(PRIORITIZED_FREE_DISK_GB + 1);
+    for (int i = 0; i < nodeBuilders.size(); i++) {
+      if (i == LOW_SPACE_NODE_INDEX) {
+        nodeBuilders.get(i).setCoreCount(1).setFreeDiskGB(MINIMAL_FREE_DISK_GB + 1); // Low space
+      } else if (i == NO_SPACE_NODE_INDEX) {
+        nodeBuilders.get(i).setCoreCount(10).setFreeDiskGB(1L); // Really not enough space
+      } else {
+        nodeBuilders.get(i).setCoreCount(10).setFreeDiskGB(PRIORITIZED_FREE_DISK_GB + 1);
+      }
+    }
     List<Node> liveNodes = clusterBuilder.buildLiveNodes();
 
     // The collection to create (shards are defined but no replicas)
@@ -144,36 +148,31 @@ public class AffinityPlacementFactoryTest extends SolrTestCaseJ4 {
     PlacementPlan pp = plugin.computePlacement(clusterBuilder.build(), placementRequest, clusterBuilder.buildAttributeFetcher(), new PlacementPlanFactoryImpl());
 
     assertEquals(18, pp.getReplicaPlacements().size()); // 3 shards, 6 replicas total each
-    // Verify no two replicas of same type of same shard placed on same node
-    Set<Pair<Pair<Replica.ReplicaType, String>, Node>> placements = new HashSet<>();
+    Set<Pair<String, Node>> placements = new HashSet<>();
     for (ReplicaPlacement rp : pp.getReplicaPlacements()) {
-      assertTrue("two replicas of same type for same shard placed on same node",
-          placements.add(new Pair<>(new Pair<>(rp.getReplicaType(), rp.getShardName()), rp.getNode())));
+      assertTrue("two replicas for same shard placed on same node", placements.add(new Pair<>(rp.getShardName(), rp.getNode())));
       assertNotEquals("Replica unnecessarily placed on node with low free space", rp.getNode(), liveNodes.get(LOW_SPACE_NODE_INDEX));
       assertNotEquals("Replica placed on node with not enough free space", rp.getNode(), liveNodes.get(NO_SPACE_NODE_INDEX));
     }
 
-    // Verify that if we ask for 3 replicas, the placement will use the low free space node
-    // Place two replicas of each type for each shard
+    // Verify that if we ask for 7 replicas, the placement will use the low free space node
     placementRequest = new PlacementRequestImpl(solrCollection, solrCollection.getShardNames(), new HashSet<>(liveNodes),
-        3, 0, 0);
+        7, 0, 0);
     pp = plugin.computePlacement(clusterBuilder.build(), placementRequest, clusterBuilder.buildAttributeFetcher(), new PlacementPlanFactoryImpl());
-    assertEquals(9, pp.getReplicaPlacements().size()); // 3 shards, 3 replicas each
-    // Verify no two replicas of same shard placed on same node
+    assertEquals(21, pp.getReplicaPlacements().size()); // 3 shards, 7 replicas each
     placements = new HashSet<>();
     for (ReplicaPlacement rp : pp.getReplicaPlacements()) {
       assertEquals("Only NRT replicas should be created", Replica.ReplicaType.NRT, rp.getReplicaType());
-      assertTrue("two replicas for same shard placed on same node",
-          placements.add(new Pair<>(new Pair<>(rp.getReplicaType(), rp.getShardName()), rp.getNode())));
+      assertTrue("two replicas for same shard placed on same node", placements.add(new Pair<>(rp.getShardName(), rp.getNode())));
       assertNotEquals("Replica placed on node with not enough free space", rp.getNode(), liveNodes.get(NO_SPACE_NODE_INDEX));
     }
 
-    // Verify that if we ask for 4 replicas, the placement will fail
+    // Verify that if we ask for 8 replicas, the placement fails
     try {
       placementRequest = new PlacementRequestImpl(solrCollection, solrCollection.getShardNames(), new HashSet<>(liveNodes),
-          4, 0, 0);
+          8, 0, 0);
       plugin.computePlacement(clusterBuilder.build(), placementRequest, clusterBuilder.buildAttributeFetcher(), new PlacementPlanFactoryImpl());
-      fail("Placing 4 replicas should not be possible given only three nodes have enough space");
+      fail("Placing 8 replicas should not be possible given only 7 nodes have enough space");
     } catch (PlacementException e) {
       // expected
     }
@@ -192,7 +191,7 @@ public class AffinityPlacementFactoryTest extends SolrTestCaseJ4 {
 
 
     // Cluster nodes and their attributes
-    Builders.ClusterBuilder clusterBuilder = Builders.newClusterBuilder().initializeNodes(4);
+    Builders.ClusterBuilder clusterBuilder = Builders.newClusterBuilder().initializeNodes(5);
     LinkedList<Builders.NodeBuilder> nodeBuilders = clusterBuilder.getNodeBuilders();
     int coresOnNode = 10;
     for (Builders.NodeBuilder nodeBuilder : nodeBuilders) {
@@ -205,20 +204,20 @@ public class AffinityPlacementFactoryTest extends SolrTestCaseJ4 {
     // Build the collection letting the code pick up nodes...
     collectionBuilder.initializeShardsReplicas(2, 2, 1, 0, nodeBuilders);
     // Now explicitly change the nodes to create the collection distribution we want to test:
-    // (note this collection is an illegal placement: shard 1 has two replicas on node 0. The placement plugin should still
-    // be able to place new replicas as long as they don't break the rules).
-    //  +--------------+----+----+----+----+
-    //  |         Node |  0 |  1 |  2 |  3 |
-    //  |Cores on node | 10 | 20 | 30 | 40 |
-    //  +----------------------------------+
-    //  |   Shard 1:   |    |    |    |    |
-    //  |         NRT  |  X |    |    |  X |
-    //  |         TLOG |  X |    |    |    |
-    //  +----------------------------------+
-    //  |   Shard 2:   |    |    |    |    |
-    //  |         NRT  |    |  X |    |  X |
-    //  |         TLOG |    |    |  X |    |
-    //  +--------------+----+----+----+----+
+    // (note this collection is in an illegal state: shard 1 has two replicas on node 0. The placement plugin would NOT
+    // generate such a placement but should still be able to place additional replicas as long as THEY don't break the rules).
+    //  +--------------+----+----+----+----+----+
+    //  |         Node |  0 |  1 |  2 |  3 |  4 |
+    //  |Cores on node | 10 | 20 | 30 | 40 | 50 |
+    //  +----------------------------------+----+
+    //  |   Shard 1:   |    |    |    |    |    |
+    //  |         NRT  |  X |    |    |  X |    |
+    //  |         TLOG |  X |    |    |    |    |
+    //  +----------------------------------+----+
+    //  |   Shard 2:   |    |    |    |    |    |
+    //  |         NRT  |    |  X |    |  X |    |
+    //  |         TLOG |    |    |  X |    |    |
+    //  +--------------+----+----+----+----+----+
 
     // The code below is not ideal... We only modify the parts of the collection that we need to change (replica nodes).
     // If this only happens in this test then it is likely the simplest approach.
@@ -274,18 +273,18 @@ public class AffinityPlacementFactoryTest extends SolrTestCaseJ4 {
     // i.e. those that do not already have a replica for the shard and then on the node with the lowest
     // number of cores. NRT are placed first.
     // We therefore expect the placement of the new replicas to look like:
-    //  +--------------+----+----+----+----+
-    //  |         Node |  0 |  1 |  2 |  3 |
-    //  |Cores on node | 10 | 20 | 30 | 40 |
-    //  +----------------------------------+
-    //  |   Shard 1:   |    |    |    |    |
-    //  |         NRT  |  X |  N |    |  X |
-    //  |         TLOG |  X |    |  N |    |
-    //  +----------------------------------+
-    //  |   Shard 2:   |    |    |    |    |
-    //  |         NRT  |  N |  X |    |  X |
-    //  |         TLOG |    |  N |  X |    | <-- We don't really expect this. It should be impossible to place this TLOG with 4 nodes
-    //  +--------------+----+----+----+----+
+    //  +--------------+----+----+----+----+----+
+    //  |         Node |  0 |  1 |  2 |  3 |  4 |
+    //  |Cores on node | 10 | 20 | 30 | 40 | 50 |
+    //  +----------------------------------+----+
+    //  |   Shard 1:   |    |    |    |    |    |
+    //  |         NRT  |  X |  N |    |  X |    |
+    //  |         TLOG |  X |    |  N |    |    |
+    //  +----------------------------------+----+
+    //  |   Shard 2:   |    |    |    |    |    |
+    //  |         NRT  |  N |  X |    |  X |    |
+    //  |         TLOG |    |    |  X |    |  N |
+    //  +--------------+----+----+----+----+----+
 
 
     // Place two replicas of each type for each shard
@@ -298,7 +297,7 @@ public class AffinityPlacementFactoryTest extends SolrTestCaseJ4 {
     Set<ReplicaPlacement> replicaPlacements = pp.getReplicaPlacements();
 
     // Each expected placement is represented as a string "shard replica-type node"
-    Set<String> expectedPlacements = Set.of("1 NRT 1", "1 TLOG 2", "2 NRT 0", "2 TLOG 1");
+    Set<String> expectedPlacements = Set.of("1 NRT 1", "1 TLOG 2", "2 NRT 0", "2 TLOG 4");
     verifyPlacements(expectedPlacements, replicaPlacements, shardBuilders, liveNodes);
   }
 
@@ -398,11 +397,11 @@ public class AffinityPlacementFactoryTest extends SolrTestCaseJ4 {
       Builders.NodeBuilder nodeBuilder = clusterBuilder.getNodeBuilders().get(i);
       nodeBuilder.setCoreCount(0);
       nodeBuilder.setFreeDiskGB(100L);
-      if (i < NUM_NODES / 2) {
-        nodeBuilder.setSysprop(AffinityPlacementFactory.REPLICA_TYPE_SYSPROP, "Nrt,Tlog");
+      if (i < NUM_NODES / 3 * 2) {
+        nodeBuilder.setSysprop(AffinityPlacementFactory.REPLICA_TYPE_SYSPROP, "Nrt, TlOg");
         nodeBuilder.setSysprop("group", "one");
       } else {
-        nodeBuilder.setSysprop(AffinityPlacementFactory.REPLICA_TYPE_SYSPROP, "Pull, foobar");
+        nodeBuilder.setSysprop(AffinityPlacementFactory.REPLICA_TYPE_SYSPROP, "Pull,foobar");
         nodeBuilder.setSysprop("group", "two");
       }
     }
@@ -485,12 +484,12 @@ public class AffinityPlacementFactoryTest extends SolrTestCaseJ4 {
     PlacementRequestImpl placementRequest = new PlacementRequestImpl(solrCollection,
         StreamSupport.stream(solrCollection.shards().spliterator(), false)
             .map(Shard::getShardName).collect(Collectors.toSet()),
-        cluster.getLiveNodes(), 2, 0, 2);
+        cluster.getLiveNodes(), 1, 0, 1);
 
     PlacementPlanFactory placementPlanFactory = new PlacementPlanFactoryImpl();
     AttributeFetcher attributeFetcher = clusterBuilder.buildAttributeFetcher();
     PlacementPlan pp = plugin.computePlacement(cluster, placementRequest, attributeFetcher, placementPlanFactory);
-    assertEquals(8, pp.getReplicaPlacements().size());
+    assertEquals(4, pp.getReplicaPlacements().size());
     for (ReplicaPlacement rp : pp.getReplicaPlacements()) {
       assertFalse("should not put any replicas on " + smallNode, rp.getNode().equals(smallNode));
     }