You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2015/02/27 19:59:57 UTC
svn commit: r1662797 - in /lucene/dev/branches/lucene_solr_4_10: ./ solr/ solr/CHANGES.txt solr/core/ solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java
Author: sarowe
Date: Fri Feb 27 18:59:57 2015
New Revision: 1662797
URL: http://svn.apache.org/r1662797
Log:
SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState (merged branch_5x r1653880)
Modified:
lucene/dev/branches/lucene_solr_4_10/ (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/ (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/CHANGES.txt (contents, props changed)
lucene/dev/branches/lucene_solr_4_10/solr/core/ (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java
Modified: lucene/dev/branches/lucene_solr_4_10/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/solr/CHANGES.txt?rev=1662797&r1=1662796&r2=1662797&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/solr/CHANGES.txt (original)
+++ lucene/dev/branches/lucene_solr_4_10/solr/CHANGES.txt Fri Feb 27 18:59:57 2015
@@ -23,7 +23,7 @@ Bug Fixes
----------------------
* SOLR-6931: We should do a limited retry when using HttpClient.
- (Mark Miller, Hrishikesh Gadre, Gregory Chanan)
+ (Mark Miller, Hrishikesh Gadre, Gregory Chanan)
* SOLR-6780: Fixed a bug in how default/appends/invariants params were affecting the set
of all "keys" found in the request parameters, resulting in some key=value param pairs
@@ -94,6 +94,9 @@ Bug Fixes
even when recoveries are rapidly stopped and started as well as when a
node attempts to become the leader for a shard.
(Mark Miller, Maxim Novikov)
+
+* SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState.
+ (shalin)
Other Changes
----------------------
Modified: lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java?rev=1662797&r1=1662796&r2=1662797&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java Fri Feb 27 18:59:57 2015
@@ -193,7 +193,7 @@ public class LeaderInitiatedRecoveryThre
// additional safeguard against the replica trying to be in the active state
// before acknowledging the leader initiated recovery command
- if (continueTrying && collection != null && shardId != null) {
+ if (collection != null && shardId != null) {
try {
// call out to ZooKeeper to get the leader-initiated recovery state
String lirState =
@@ -218,20 +218,25 @@ public class LeaderInitiatedRecoveryThre
List<ZkCoreNodeProps> replicaProps =
zkStateReader.getReplicaProps(collection, shardId, leaderCoreNodeName);
if (replicaProps != null && replicaProps.size() > 0) {
- String replicaState = replicaProps.get(0).getState();
- if (ZkStateReader.ACTIVE.equals(replicaState)) {
- // replica published its state as "active",
- // which is bad if lirState is still "down"
- if (ZkStateReader.DOWN.equals(lirState)) {
- // OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
- // so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
- log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
- + " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
- zkController.ensureReplicaInLeaderInitiatedRecovery(collection,
- shardId, replicaUrl, nodeProps, true); // force republish state to "down"
+ for (ZkCoreNodeProps prop : replicaProps) {
+ if (replicaCoreNodeName.equals(((Replica) prop.getNodeProps()).getName())) {
+ String replicaState = prop.getState();
+ if (ZkStateReader.ACTIVE.equals(replicaState)) {
+ // replica published its state as "active",
+ // which is bad if lirState is still "down"
+ if (ZkStateReader.DOWN.equals(lirState)) {
+ // OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
+ // so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
+ log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
+ + " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
+ zkController.ensureReplicaInLeaderInitiatedRecovery(collection,
+ shardId, replicaUrl, nodeProps, true); // force republish state to "down"
+ }
+ }
+ break;
}
- }
- }
+ }
+ }
}
} catch (Exception ignoreMe) {
log.warn("Failed to determine state of core={} coreNodeName={} due to: "+ignoreMe, coreNeedingRecovery, replicaCoreNodeName);