You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by er...@apache.org on 2020/07/24 20:50:27 UTC

[lucene-solr] branch branch_8x updated: SOLR-11656: TLOG replication doesn't work properly after rebalancing leaders.

This is an automated email from the ASF dual-hosted git repository.

erick pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new f95e485  SOLR-11656: TLOG replication doesn't work properly after rebalancing leaders.
f95e485 is described below

commit f95e485d9467a13df3abf1f9383abda92eab1486
Author: Erick Erickson <Er...@gmail.com>
AuthorDate: Fri Jul 24 13:10:35 2020 -0400

    SOLR-11656: TLOG replication doesn't work properly after rebalancing leaders.
---
 solr/CHANGES.txt                                   |  4 +-
 .../java/org/apache/solr/cloud/ZkController.java   | 14 ++++
 .../org/apache/solr/cloud/TestTlogReplica.java     | 77 ++++++++++++++++++++++
 3 files changed, 94 insertions(+), 1 deletion(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index b17a033..bd1e2eb 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -40,7 +40,9 @@ Optimizations
 
 Bug Fixes
 ---------------------
-(No changes)
+
+* SOLR-11656: TLOG replication doesn't work properly after rebalancing leaders. (Yuki Yano via
+  Erick Erickson)
 
 Other Changes
 ---------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 89ee55e..3be88d5 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -2238,6 +2238,20 @@ public class ZkController implements Closeable {
       electionContexts.put(contextKey, context);
 
       elect.retryElection(context, params.getBool(REJOIN_AT_HEAD_PROP, false));
+
+      try (SolrCore core = cc.getCore(coreName)) {
+        Replica.Type replicaType = core.getCoreDescriptor().getCloudDescriptor().getReplicaType();
+        if (replicaType == Type.TLOG) {
+          String leaderUrl = getLeader(core.getCoreDescriptor().getCloudDescriptor(), cloudConfig.getLeaderVoteWait());
+          String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
+          if (!leaderUrl.equals(ourUrl)) {
+            // restart the replication thread to ensure the replication is running in each new replica
+            // especially if previous role is "leader" (i.e., no replication thread)
+            stopReplicationFromLeader(coreName);
+            startReplicationFromLeader(coreName, false);
+          }
+        }
+      }
     } catch (Exception e) {
       throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to rejoin election", e);
     } finally {
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestTlogReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestTlogReplica.java
index fe30305..4e531ac 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestTlogReplica.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestTlogReplica.java
@@ -47,6 +47,7 @@ import org.apache.solr.client.solrj.embedded.JettySolrRunner;
 import org.apache.solr.client.solrj.impl.CloudSolrClient;
 import org.apache.solr.client.solrj.impl.HttpSolrClient;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.QueryRequest;
 import org.apache.solr.client.solrj.request.UpdateRequest;
 import org.apache.solr.client.solrj.response.CollectionAdminResponse;
 import org.apache.solr.client.solrj.response.QueryResponse;
@@ -59,6 +60,8 @@ import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.ZkStateReader;
+import org.apache.solr.common.params.CollectionParams;
+import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.TimeSource;
 import org.apache.solr.core.SolrCore;
@@ -677,7 +680,81 @@ public class TestTlogReplica extends SolrCloudTestCase {
         .commit(cloudClient, collectionName);
     waitForNumDocsInAllActiveReplicas(4, 0);
   }
+  public void testRebalanceLeaders() throws Exception {
+    createAndWaitForCollection(1,0,2,0);
+    CloudSolrClient cloudClient = cluster.getSolrClient();
+    new UpdateRequest()
+        .deleteByQuery("*:*")
+        .commit(cluster.getSolrClient(), collectionName);
+
+    // Find a replica which isn't leader
+    DocCollection docCollection = cloudClient.getZkStateReader().getClusterState().getCollection(collectionName);
+    Slice slice = docCollection.getSlices().iterator().next();
+    Replica newLeader = null;
+    for (Replica replica : slice.getReplicas()) {
+      if (slice.getLeader() == replica) continue;
+      newLeader = replica;
+      break;
+    }
+    assertNotNull("Failed to find a candidate of new leader", newLeader);
+
+    // Set preferredLeader flag to the replica
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set("action", CollectionParams.CollectionAction.ADDREPLICAPROP.toString());
+    params.set("collection", collectionName);
+    params.set("shard", slice.getName());
+    params.set("replica", newLeader.getName());
+    params.set("property", "preferredLeader");
+    params.set("property.value", "true");
+    QueryRequest request = new QueryRequest(params);
+    request.setPath("/admin/collections");
+    cloudClient.request(request);
+
+    // Wait until a preferredleader flag is set to the new leader candidate
+    TimeOut timeout = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+    while (!timeout.hasTimedOut()) {
+      Map<String, Slice> slices = cloudClient.getZkStateReader().getClusterState().getCollection(collectionName).getSlicesMap();
+      Replica me = slices.get(slice.getName()).getReplica(newLeader.getName());
+      if (me.getBool("property.preferredleader", false)) {
+        break;
+      }
+      Thread.sleep(100);
+    }
+    assertFalse("Timeout waiting for setting preferredleader flag", timeout.hasTimedOut());
 
+    // Rebalance leaders
+    params = new ModifiableSolrParams();
+    params.set("action", CollectionParams.CollectionAction.REBALANCELEADERS.toString());
+    params.set("collection", collectionName);
+    params.set("maxAtOnce", "10");
+    request = new QueryRequest(params);
+    request.setPath("/admin/collections");
+    cloudClient.request(request);
+
+    // Wait until a new leader is elected
+    timeout = new TimeOut(30, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+    while (!timeout.hasTimedOut()) {
+      docCollection = getCollectionState(collectionName);
+      Replica leader = docCollection.getSlice(slice.getName()).getLeader();
+      if (leader != null && leader.getName().equals(newLeader.getName()) &&
+          leader.isActive(cluster.getSolrClient().getZkStateReader().getClusterState().getLiveNodes())) {
+        break;
+      }
+      Thread.sleep(100);
+    }
+    assertFalse("Timeout waiting for a new leader to be elected", timeout.hasTimedOut());
+
+    new UpdateRequest()
+        .add(sdoc("id", "1"))
+        .add(sdoc("id", "2"))
+        .add(sdoc("id", "3"))
+        .add(sdoc("id", "4"))
+        .process(cloudClient, collectionName);
+    checkRTG(1,4, cluster.getJettySolrRunners());
+    new UpdateRequest()
+        .commit(cloudClient, collectionName);
+    waitForNumDocsInAllActiveReplicas(4);
+  }
   private void waitForLeaderChange(JettySolrRunner oldLeaderJetty, String shardName) {
     waitForState("Expect new leader", collectionName,
         (liveNodes, collectionState) -> {