You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2015/11/13 14:40:52 UTC
svn commit: r1714204 - in /lucene/dev/trunk/solr/core/src:
java/org/apache/solr/cloud/ElectionContext.java
test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnShardRestartTest.java
Author: markrmiller
Date: Fri Nov 13 13:40:52 2015
New Revision: 1714204
URL: http://svn.apache.org/viewvc?rev=1714204&view=rev
Log:
SOLR-8075: Fix faulty implementation.
Modified:
lucene/dev/trunk/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnShardRestartTest.java
Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java?rev=1714204&r1=1714203&r2=1714204&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java Fri Nov 13 13:40:52 2015
@@ -393,14 +393,20 @@ final class ShardLeaderElectionContext e
return;
}
- log.info("I am the new leader: " + ZkCoreNodeProps.getCoreUrl(leaderProps) + " " + shardId);
- core.getCoreDescriptor().getCloudDescriptor().setLeader(true);
}
boolean isLeader = true;
if (!isClosed) {
try {
+ // we must check LIR before registering as leader
+ checkLIR(coreName, allReplicasInLine);
+
super.runLeaderProcess(weAreReplacement, 0);
+ try (SolrCore core = cc.getCore(coreName)) {
+ core.getCoreDescriptor().getCloudDescriptor().setLeader(true);
+ }
+ log.info("I am the new leader: " + ZkCoreNodeProps.getCoreUrl(leaderProps) + " " + shardId);
+
} catch (Exception e) {
isLeader = false;
SolrException.log(log, "There was a problem trying to register as the leader", e);
@@ -420,23 +426,6 @@ final class ShardLeaderElectionContext e
}
if (isLeader) {
- if (allReplicasInLine) {
- // SOLR-8075: A bug may allow the proper leader to get marked as LIR DOWN and
- // if we are marked as DOWN but were able to become the leader, we remove
- // the DOWN entry here so that we don't fail publishing ACTIVE due to being in LIR.
- // We only do this if all the replicas participated in the election just in case
- // this was a valid LIR entry and the proper leader replica is missing.
- try (SolrCore core = cc.getCore(coreName)) {
- final Replica.State lirState = zkController.getLeaderInitiatedRecoveryState(collection, shardId,
- core.getCoreDescriptor().getCloudDescriptor().getCoreNodeName());
- if (lirState == Replica.State.DOWN) {
- zkController.updateLeaderInitiatedRecoveryState(collection, shardId,
- leaderProps.getStr(ZkStateReader.CORE_NODE_NAME_PROP), Replica.State.ACTIVE, null, true);
- }
- }
-
- }
-
// check for any replicas in my shard that were set to down by the previous leader
try {
startLeaderInitiatedRecoveryOnReplicas(coreName);
@@ -452,6 +441,41 @@ final class ShardLeaderElectionContext e
MDCLoggingContext.clear();
}
}
+
+ public void checkLIR(String coreName, boolean allReplicasInLine)
+ throws InterruptedException, KeeperException, IOException {
+ if (allReplicasInLine) {
+ // SOLR-8075: A bug may allow the proper leader to get marked as LIR DOWN and
+ // if we are marked as DOWN but were able to become the leader, we remove
+ // the DOWN entry here so that we don't fail publishing ACTIVE due to being in LIR.
+ // We only do this if all the replicas participated in the election just in case
+ // this was a valid LIR entry and the proper leader replica is missing.
+ try (SolrCore core = cc.getCore(coreName)) {
+ final Replica.State lirState = zkController.getLeaderInitiatedRecoveryState(collection, shardId,
+ core.getCoreDescriptor().getCloudDescriptor().getCoreNodeName());
+ if (lirState == Replica.State.DOWN) {
+ // We can do this before registering as leader because only setting DOWN requires that
+ // we are already registered as leader, and here we are setting ACTIVE
+ // The fact that we just won the zk leader election provides a quasi lock on setting this state, but
+ // we should improve this: see SOLR-8075 discussion
+ zkController.updateLeaderInitiatedRecoveryState(collection, shardId,
+ leaderProps.getStr(ZkStateReader.CORE_NODE_NAME_PROP), Replica.State.ACTIVE, core.getCoreDescriptor(), true);
+ }
+ }
+
+ } else {
+ try (SolrCore core = cc.getCore(coreName)) {
+ final Replica.State lirState = zkController.getLeaderInitiatedRecoveryState(collection, shardId,
+ core.getCoreDescriptor().getCloudDescriptor().getCoreNodeName());
+ if (lirState == Replica.State.DOWN || lirState == Replica.State.RECOVERING) {
+ log.warn("The previous leader marked me " + core.getName()
+ + " as " + lirState.toString() + " and I haven't recovered yet, so I shouldn't be the leader.");
+
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Leader Initiated Recovery prevented leadership");
+ }
+ }
+ }
+ }
private void startLeaderInitiatedRecoveryOnReplicas(String coreName) throws Exception {
try (SolrCore core = cc.getCore(coreName)) {
@@ -594,18 +618,6 @@ final class ShardLeaderElectionContext e
}
if (core.getCoreDescriptor().getCloudDescriptor().getLastPublished() == Replica.State.ACTIVE) {
-
- // maybe active but if the previous leader marked us as down and
- // we haven't recovered, then can't be leader
- final Replica.State lirState = zkController.getLeaderInitiatedRecoveryState(collection, shardId,
- core.getCoreDescriptor().getCloudDescriptor().getCoreNodeName());
- if (lirState == Replica.State.DOWN || lirState == Replica.State.RECOVERING) {
- log.warn("Although my last published state is Active, the previous leader marked me "+core.getName()
- + " as " + lirState.toString()
- + " and I haven't recovered yet, so I shouldn't be the leader.");
- return false;
- }
-
log.info("My last published State was Active, it's okay to be the leader.");
return true;
}
Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnShardRestartTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnShardRestartTest.java?rev=1714204&r1=1714203&r2=1714204&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnShardRestartTest.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnShardRestartTest.java Fri Nov 13 13:40:52 2015
@@ -26,6 +26,7 @@ import org.apache.solr.client.solrj.embe
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.Utils;
+import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -41,7 +42,7 @@ public class LeaderInitiatedRecoveryOnSh
public LeaderInitiatedRecoveryOnShardRestartTest() {
super();
sliceCount = 1;
- fixShardCount(2);
+ fixShardCount(3);
}
@Test
@@ -50,7 +51,7 @@ public class LeaderInitiatedRecoveryOnSh
String testCollectionName = "all_in_lir";
String shardId = "shard1";
- createCollection(testCollectionName, 1, 2, 1);
+ createCollection(testCollectionName, 1, 3, 1);
cloudClient.setDefaultCollection(testCollectionName);
@@ -64,8 +65,9 @@ public class LeaderInitiatedRecoveryOnSh
SolrZkClient zkClient = cloudClient.getZkStateReader().getZkClient();
zkClient.makePath("/collections/" + testCollectionName + "/leader_initiated_recovery/" + shardId + "/core_node1", znodeData, true);
zkClient.makePath("/collections/" + testCollectionName + "/leader_initiated_recovery/" + shardId + "/core_node2", znodeData, true);
+ zkClient.makePath("/collections/" + testCollectionName + "/leader_initiated_recovery/" + shardId + "/core_node3", znodeData, true);
- printLayout();
+ // printLayout();
for (JettySolrRunner jetty : jettys) {
ChaosMonkey.stop(jetty);
@@ -79,5 +81,31 @@ public class LeaderInitiatedRecoveryOnSh
// recoveries will not finish without SOLR-8075
waitForRecoveriesToFinish(testCollectionName, true);
+
+ // now expire each node
+ try {
+ zkClient.makePath("/collections/" + testCollectionName + "/leader_initiated_recovery/" + shardId + "/core_node1", znodeData, true);
+ } catch (NodeExistsException e) {
+
+ }
+ try {
+ zkClient.makePath("/collections/" + testCollectionName + "/leader_initiated_recovery/" + shardId + "/core_node2", znodeData, true);
+ } catch (NodeExistsException e) {
+
+ }
+ try {
+ zkClient.makePath("/collections/" + testCollectionName + "/leader_initiated_recovery/" + shardId + "/core_node3", znodeData, true);
+ } catch (NodeExistsException e) {
+
+ }
+
+ for (JettySolrRunner jetty : jettys) {
+ chaosMonkey.expireSession(jetty);
+ }
+
+ Thread.sleep(2000);
+
+ // recoveries will not finish without SOLR-8075
+ waitForRecoveriesToFinish(testCollectionName, true);
}
}