Posted to commits@lucene.apache.org by ho...@apache.org on 2021/11/16 21:33:17 UTC

[lucene-solr] branch branch_8_11 updated: SOLR-15795: Fix REPLACENODE to not use source node (#414)

This is an automated email from the ASF dual-hosted git repository.

houston pushed a commit to branch branch_8_11
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8_11 by this push:
     new 7cd45b7  SOLR-15795: Fix REPLACENODE to not use source node (#414)
7cd45b7 is described below

commit 7cd45b750d4f7b1ae360e83fdf74a5f6ae183eaf
Author: Houston Putman <ho...@apache.org>
AuthorDate: Tue Nov 16 16:05:44 2021 -0500

    SOLR-15795: Fix REPLACENODE to not use source node (#414)
---
 solr/CHANGES.txt                                   |  2 +
 .../apache/solr/cloud/api/collections/Assign.java  | 23 +++++++
 .../solr/cloud/api/collections/ReplaceNodeCmd.java |  5 +-
 .../org/apache/solr/cloud/ReplaceNodeTest.java     | 73 ++++++++++++++--------
 4 files changed, 75 insertions(+), 28 deletions(-)
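
For context: REPLACENODE moves every replica off a source node. Before this fix the command could pick the source node itself as a target for the new replicas; the patch filters the source out of the candidate list and rejects the request when the source is the only live node. A minimal SolrJ sketch of issuing the command (the ZooKeeper address and node name below are placeholders, not values from this commit):

    import java.util.Collections;
    import java.util.Optional;

    import org.apache.solr.client.solrj.impl.CloudSolrClient;
    import org.apache.solr.client.solrj.request.CollectionAdminRequest;

    public class ReplaceNodeExample {
      public static void main(String[] args) throws Exception {
        // Placeholder ZooKeeper address; adjust for a real cluster.
        try (CloudSolrClient client = new CloudSolrClient.Builder(
                Collections.singletonList("localhost:9983"), Optional.empty()).build()) {
          // Move everything off this (placeholder) node; with no explicit target,
          // Solr now chooses targets from the live nodes excluding the source itself.
          String sourceNode = "127.0.0.1:8983_solr";
          new CollectionAdminRequest.ReplaceNode(sourceNode, null).process(client);
        }
      }
    }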

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index f922881..8fa5d06 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -24,6 +24,8 @@ Bug Fixes
 ---------------------
 * SOLR-15635: Don't close hooks twice when SolrRequestInfo is cleared twice; or /export with classic join
   closed fromCore if provided (Mikhail Khludnev, David Smiley)
+  
+* SOLR-15795: Fix REPLACENODE to not use source node when choosing a target node for new replicas (Houston Putman)
 
 Build
 ---------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/Assign.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/Assign.java
index 8f3b247..138f5af 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/Assign.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/Assign.java
@@ -453,6 +453,26 @@ public class Assign {
     return nodeNameVsShardCount;
   }
 
+  // Throw an exception if none of the nodes in the supplied list are live.
+  // A null or empty list will also fail.
+  // Returns the input list.
+  private static List<String> checkAnyLiveNodes(List<String> createNodeList, ClusterState clusterState) {
+    Set<String> liveNodes = clusterState.getLiveNodes();
+    if (createNodeList == null) {
+      createNodeList = Collections.emptyList();
+    }
+    boolean anyLiveNodes = false;
+    for (String node : createNodeList) {
+      anyLiveNodes |= liveNodes.contains(node);
+    }
+    if (!anyLiveNodes) {
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+          "None of the node(s) specified " + createNodeList + " are currently active in "
+              + liveNodes + ", no action taken.");
+    }
+    return createNodeList; // returned unmodified for inline use; only replaced when null, in which case the check above throws
+  }
+
   /**
    * Thrown if there is an exception while assigning nodes for replicas
    */
@@ -559,6 +579,9 @@ public class Assign {
         nodeList = sortedNodeList.stream().map(replicaCount -> replicaCount.nodeName).collect(Collectors.toList());
       }
 
+      // Throw an error if there aren't any live nodes.
+      checkAnyLiveNodes(nodeList, solrCloudManager.getClusterStateProvider().getClusterState());
+
       int i = 0;
       List<ReplicaPosition> result = new ArrayList<>();
       for (String aShard : assignRequest.shardNames)
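
The new checkAnyLiveNodes guard only requires that at least one requested node be live; a null or empty createNodeList fails with BAD_REQUEST. A standalone sketch of the pass/fail behavior (not Solr code; placeholder node names):

    import java.util.Arrays;
    import java.util.Collections;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    public class AnyLiveNodeSketch {
      // Mirrors the intent of checkAnyLiveNodes: succeed if at least one
      // requested node is currently live, otherwise treat it as a bad request.
      static boolean anyLive(List<String> requested, Set<String> liveNodes) {
        if (requested == null) requested = Collections.emptyList();
        return requested.stream().anyMatch(liveNodes::contains);
      }

      public static void main(String[] args) {
        Set<String> live = new HashSet<>(Arrays.asList("nodeA:8983_solr", "nodeB:8983_solr"));
        System.out.println(anyLive(Arrays.asList("nodeA:8983_solr", "nodeC:8983_solr"), live)); // true
        System.out.println(anyLive(Arrays.asList("nodeC:8983_solr"), live));                    // false
        System.out.println(anyLive(Collections.emptyList(), live));                             // false
      }
    }
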
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/ReplaceNodeCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/ReplaceNodeCmd.java
index f1c1f8c..238adee 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/ReplaceNodeCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/ReplaceNodeCmd.java
@@ -28,6 +28,7 @@ import java.util.Map;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicReference;
+import java.util.stream.Collectors;
 
 import org.apache.solr.client.solrj.cloud.autoscaling.PolicyHelper;
 import org.apache.solr.cloud.ActiveReplicaWatcher;
@@ -81,6 +82,8 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
     }
     if (target != null && !clusterState.liveNodesContain(target)) {
       throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Target Node: " + target + " is not live");
+    } else if (clusterState.getLiveNodes().size() <= 1) {
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "No nodes other than the source node: " + source + " are live, therefore replicas cannot be moved");
     }
     List<ZkNodeProps> sourceReplicas = getReplicasOfNode(source, clusterState);
     // how many leaders are we moving? for these replicas we have to make sure that either:
@@ -122,7 +125,7 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
               .assignNrtReplicas(numNrtReplicas)
               .assignTlogReplicas(numTlogReplicas)
               .assignPullReplicas(numPullReplicas)
-              .onNodes(new ArrayList<>(ocmh.cloudManager.getClusterStateProvider().getLiveNodes()))
+              .onNodes(new ArrayList<>(ocmh.cloudManager.getClusterStateProvider().getLiveNodes().stream().filter(node -> !node.equals(source)).collect(Collectors.toList())))
               .build();
           Assign.AssignStrategyFactory assignStrategyFactory = new Assign.AssignStrategyFactory(ocmh.cloudManager);
           Assign.AssignStrategy assignStrategy = assignStrategyFactory.create(clusterState, clusterState.getCollection(sourceCollection));
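
The heart of the ReplaceNodeCmd change above is the stream filter that drops the source node from the candidate list handed to the assign strategy. The same idea in isolation (placeholder node names, not code from the patch):

    import java.util.Arrays;
    import java.util.List;
    import java.util.stream.Collectors;

    public class FilterSourceNodeSketch {
      public static void main(String[] args) {
        // Placeholder live nodes in Solr's host:port_context naming scheme.
        List<String> liveNodes = Arrays.asList(
            "127.0.0.1:8983_solr", "127.0.0.1:7574_solr", "127.0.0.1:7575_solr");
        String source = "127.0.0.1:8983_solr";

        // As in the onNodes(...) change: candidate targets are the
        // live nodes minus the node being replaced.
        List<String> candidates = liveNodes.stream()
            .filter(node -> !node.equals(source))
            .collect(Collectors.toList());

        System.out.println(candidates); // [127.0.0.1:7574_solr, 127.0.0.1:7575_solr]
      }
    }
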
diff --git a/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java b/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java
index 7c1a7ba..ef7d607 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java
@@ -22,7 +22,6 @@ import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.EnumSet;
-import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -35,6 +34,7 @@ import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.client.solrj.request.CoreAdminRequest;
 import org.apache.solr.client.solrj.response.CoreAdminResponse;
 import org.apache.solr.client.solrj.response.RequestStatusState;
+import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
@@ -44,6 +44,7 @@ import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.metrics.MetricsMap;
 import org.apache.solr.metrics.SolrMetricManager;
+import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.slf4j.Logger;
@@ -54,9 +55,12 @@ public class ReplaceNodeTest extends SolrCloudTestCase {
   @BeforeClass
   public static void setupCluster() throws Exception {
     System.setProperty("metricsEnabled", "true");
-    configureCluster(6)
-        .addConfig("conf1", TEST_PATH().resolve("configsets").resolve("cloud-dynamic").resolve("conf"))
-        .configure();
+  }
+
+  @Before
+  public void clearPreviousCluster() throws Exception {
+    // Clear the previous cluster before each test, since they use different numbers of nodes.
+    shutdownCluster();
   }
 
   protected String getSolrXml() {
@@ -65,6 +69,9 @@ public class ReplaceNodeTest extends SolrCloudTestCase {
 
   @Test
   public void test() throws Exception {
+    configureCluster(6)
+        .addConfig("conf1", TEST_PATH().resolve("configsets").resolve("cloud-dynamic").resolve("conf"))
+        .configure();
     String coll = "replacenodetest_coll";
     if (log.isInfoEnabled()) {
       log.info("total_jettys: {}", cluster.getJettySolrRunners().size());
@@ -79,23 +86,23 @@ public class ReplaceNodeTest extends SolrCloudTestCase {
     CollectionAdminRequest.Create create;
     // NOTE: always using the createCollection that takes in 'int' for all types of replicas, so we never
     // have to worry about null checking when comparing the Create command with the final Slices
-    
+
     // TODO: tlog replicas do not work correctly in tests due to fault TestInjection#waitForInSyncWithLeader
     create = pickRandom(
-                        CollectionAdminRequest.createCollection(coll, "conf1", 5, 2,0,0),
-                        //CollectionAdminRequest.createCollection(coll, "conf1", 5, 1,1,0),
-                        //CollectionAdminRequest.createCollection(coll, "conf1", 5, 0,1,1),
-                        //CollectionAdminRequest.createCollection(coll, "conf1", 5, 1,0,1),
-                        //CollectionAdminRequest.createCollection(coll, "conf1", 5, 0,2,0),
-                        // check also replicationFactor 1
-                        CollectionAdminRequest.createCollection(coll, "conf1", 5, 1,0,0)
-                        //CollectionAdminRequest.createCollection(coll, "conf1", 5, 0,1,0)
+        CollectionAdminRequest.createCollection(coll, "conf1", 5, 2,0,0),
+        //CollectionAdminRequest.createCollection(coll, "conf1", 5, 1,1,0),
+        //CollectionAdminRequest.createCollection(coll, "conf1", 5, 0,1,1),
+        //CollectionAdminRequest.createCollection(coll, "conf1", 5, 1,0,1),
+        //CollectionAdminRequest.createCollection(coll, "conf1", 5, 0,2,0),
+        // check also replicationFactor 1
+        CollectionAdminRequest.createCollection(coll, "conf1", 5, 1,0,0)
+        //CollectionAdminRequest.createCollection(coll, "conf1", 5, 0,1,0)
     );
     create.setCreateNodeSet(StrUtils.join(l, ',')).setMaxShardsPerNode(3);
     cloudClient.request(create);
-    
+
     cluster.waitForActiveCollection(coll, 5, 5 * (create.getNumNrtReplicas() + create.getNumPullReplicas() + create.getNumTlogReplicas()));
-    
+
     DocCollection collection = cloudClient.getZkStateReader().getClusterState().getCollection(coll);
     log.debug("### Before decommission: {}", collection);
     log.info("excluded_node : {}  ", emptyNode);
@@ -108,13 +115,13 @@ public class ReplaceNodeTest extends SolrCloudTestCase {
         success = true;
         break;
       }
-      assertFalse(rsp.getRequestStatus() == RequestStatusState.FAILED);
+      assertNotSame(rsp.getRequestStatus(), RequestStatusState.FAILED);
       Thread.sleep(50);
     }
     assertTrue(success);
     try (HttpSolrClient coreclient = getHttpSolrClient(cloudClient.getZkStateReader().getBaseUrlForNodeName(node2bdecommissioned))) {
       CoreAdminResponse status = CoreAdminRequest.getStatus(null, coreclient);
-      assertTrue(status.getCoreStatus().size() == 0);
+      assertEquals(0, status.getCoreStatus().size());
     }
 
     Thread.sleep(5000);
@@ -139,7 +146,7 @@ public class ReplaceNodeTest extends SolrCloudTestCase {
         success = true;
         break;
       }
-      assertFalse(rsp.getRequestStatus() == RequestStatusState.FAILED);
+      assertNotSame(rsp.getRequestStatus(), RequestStatusState.FAILED);
       Thread.sleep(50);
     }
     assertTrue(success);
@@ -157,14 +164,7 @@ public class ReplaceNodeTest extends SolrCloudTestCase {
     }
     // make sure all newly created replicas on node are active
     List<Replica> newReplicas = collection.getReplicas(node2bdecommissioned);
-    replicas.forEach(r -> {
-      for (Iterator<Replica> it = newReplicas.iterator(); it.hasNext(); ) {
-        Replica nr = it.next();
-        if (nr.getName().equals(r.getName())) {
-          it.remove();
-        }
-      }
-    });
+    replicas.forEach(r -> newReplicas.removeIf(nr -> nr.getName().equals(r.getName())));
     assertFalse(newReplicas.isEmpty());
     for (Replica r : newReplicas) {
       assertEquals(r.toString(), Replica.State.ACTIVE, r.getState());
@@ -173,7 +173,7 @@ public class ReplaceNodeTest extends SolrCloudTestCase {
     replicas = collection.getReplicas(emptyNode);
     if (replicas != null) {
       for (Replica r : replicas) {
-        assertFalse(r.toString(), Replica.State.ACTIVE.equals(r.getState()));
+        assertNotEquals(r.toString(), Replica.State.ACTIVE, r.getState());
       }
     }
 
@@ -211,6 +211,25 @@ public class ReplaceNodeTest extends SolrCloudTestCase {
 
   }
 
+  @Test
+  public void testFailOnSingleNode() throws Exception {
+    configureCluster(1)
+        .addConfig("conf1", TEST_PATH().resolve("configsets").resolve("cloud-dynamic").resolve("conf"))
+        .configure();
+    String coll = "replacesinglenodetest_coll";
+    if (log.isInfoEnabled()) {
+      log.info("total_jettys: {}", cluster.getJettySolrRunners().size());
+    }
+
+    CloudSolrClient cloudClient = cluster.getSolrClient();
+    cloudClient.request(CollectionAdminRequest.createCollection(coll, "conf1", 5, 1,0,0).setMaxShardsPerNode(5));
+
+    cluster.waitForActiveCollection(coll, 5, 5);
+
+    String liveNode = cloudClient.getZkStateReader().getClusterState().getLiveNodes().iterator().next();
+    expectThrows(SolrException.class, () -> createReplaceNodeRequest(liveNode, null, null).process(cloudClient));
+  }
+
   public static  CollectionAdminRequest.AsyncCollectionAdminRequest createReplaceNodeRequest(String sourceNode, String targetNode, Boolean parallel) {
     if (random().nextBoolean()) {
       return new CollectionAdminRequest.ReplaceNode(sourceNode, targetNode).setParallel(parallel);