You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2015/10/04 18:55:37 UTC
svn commit: r1706701 - in /lucene/dev/branches/branch_5x: ./ solr/
solr/core/ solr/core/src/java/org/apache/solr/cloud/
solr/core/src/test-files/solr/ solr/core/src/test/org/apache/solr/cloud/
Author: markrmiller
Date: Sun Oct 4 16:55:36 2015
New Revision: 1706701
URL: http://svn.apache.org/viewvc?rev=1706701&view=rev
Log:
SOLR-8075: Leader Initiated Recovery should not stop a leader that participated in an election with all of its replicas from becoming a valid leader.
Added:
lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnShardRestartTest.java
- copied unchanged from r1706699, lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnShardRestartTest.java
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/solr/ (props changed)
lucene/dev/branches/branch_5x/solr/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/solr/core/ (props changed)
lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
lucene/dev/branches/branch_5x/solr/core/src/test-files/solr/solr.xml
Modified: lucene/dev/branches/branch_5x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/CHANGES.txt?rev=1706701&r1=1706700&r2=1706701&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/solr/CHANGES.txt Sun Oct 4 16:55:36 2015
@@ -129,6 +129,9 @@ Bug Fixes
* SOLR-8094: HdfsUpdateLog should not replay buffered documents as a replacement to dropping them.
(Mark Miller)
+* SOLR-8075: Leader Initiated Recovery should not stop a leader that participated in an election with all
+ of its replicas from becoming a valid leader. (Mark Miller)
+
Optimizations
----------------------
Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java?rev=1706701&r1=1706700&r2=1706701&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java Sun Oct 4 16:55:36 2015
@@ -288,8 +288,9 @@ final class ShardLeaderElectionContext e
Overseer.getInQueue(zkClient).offer(Utils.toJSON(m));
int leaderVoteWait = cc.getZkController().getLeaderVoteWait();
+ boolean allReplicasInLine = false;
if (!weAreReplacement) {
- waitForReplicasToComeUp(leaderVoteWait);
+ allReplicasInLine = waitForReplicasToComeUp(leaderVoteWait);
}
if (isClosed) {
@@ -408,6 +409,23 @@ final class ShardLeaderElectionContext e
}
if (isLeader) {
+ if (allReplicasInLine) {
+ // SOLR-8075: A bug may allow the proper leader to get marked as LIR DOWN and
+ // if we are marked as DOWN but were able to become the leader, we remove
+ // the DOWN entry here so that we don't fail publishing ACTIVE due to being in LIR.
+ // We only do this if all the replicas participated in the election just in case
+ // this was a valid LIR entry and the proper leader replica is missing.
+ try (SolrCore core = cc.getCore(coreName)) {
+ final Replica.State lirState = zkController.getLeaderInitiatedRecoveryState(collection, shardId,
+ core.getCoreDescriptor().getCloudDescriptor().getCoreNodeName());
+ if (lirState == Replica.State.DOWN) {
+ zkController.updateLeaderInitiatedRecoveryState(collection, shardId,
+ leaderProps.getStr(ZkStateReader.CORE_NODE_NAME_PROP), Replica.State.ACTIVE, null, true);
+ }
+ }
+
+ }
+
// check for any replicas in my shard that were set to down by the previous leader
try {
startLeaderInitiatedRecoveryOnReplicas(coreName);
@@ -477,7 +495,8 @@ final class ShardLeaderElectionContext e
} // core gets closed automagically
}
- private void waitForReplicasToComeUp(int timeoutms) throws InterruptedException {
+ // returns true if all replicas are found to be up, false if not
+ private boolean waitForReplicasToComeUp(int timeoutms) throws InterruptedException {
long timeoutAt = System.nanoTime() + TimeUnit.NANOSECONDS.convert(timeoutms, TimeUnit.MILLISECONDS);
final String shardsElectZkPath = electionPath + LeaderElector.ELECTION_NODE;
@@ -503,7 +522,7 @@ final class ShardLeaderElectionContext e
// on startup and after connection timeout, wait for all known shards
if (found >= slices.getReplicasMap().size()) {
log.info("Enough replicas found to continue.");
- return;
+ return true;
} else {
if (cnt % 40 == 0) {
log.info("Waiting until we see more replicas up for shard {}: total={}"
@@ -516,12 +535,12 @@ final class ShardLeaderElectionContext e
if (System.nanoTime() > timeoutAt) {
log.info("Was waiting for replicas to come up, but they are taking too long - assuming they won't come back till later");
- return;
+ return false;
}
} else {
log.warn("Shard not found: " + shardId + " for collection " + collection);
- return;
+ return false;
}
@@ -529,6 +548,7 @@ final class ShardLeaderElectionContext e
slices = zkController.getClusterState().getSlice(collection, shardId);
cnt++;
}
+ return false;
}
private void rejoinLeaderElection(SolrCore core)
Modified: lucene/dev/branches/branch_5x/solr/core/src/test-files/solr/solr.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/test-files/solr/solr.xml?rev=1706701&r1=1706700&r2=1706701&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/test-files/solr/solr.xml (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/test-files/solr/solr.xml Sun Oct 4 16:55:36 2015
@@ -37,7 +37,7 @@
<str name="hostContext">${hostContext:solr}</str>
<int name="zkClientTimeout">${solr.zkclienttimeout:30000}</int>
<bool name="genericCoreNodeNames">${genericCoreNodeNames:true}</bool>
- <int name="leaderVoteWait">0</int>
+ <int name="leaderVoteWait">10000</int>
<int name="distribUpdateConnTimeout">${distribUpdateConnTimeout:45000}</int>
<int name="distribUpdateSoTimeout">${distribUpdateSoTimeout:340000}</int>
</solrcloud>