You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2015/01/22 15:37:00 UTC
svn commit: r1653880 - in /lucene/dev/branches/branch_5x: ./ solr/
solr/CHANGES.txt solr/core/
solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java
Author: shalin
Date: Thu Jan 22 14:36:59 2015
New Revision: 1653880
URL: http://svn.apache.org/r1653880
Log:
SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/solr/ (props changed)
lucene/dev/branches/branch_5x/solr/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/solr/core/ (props changed)
lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java
Modified: lucene/dev/branches/branch_5x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/CHANGES.txt?rev=1653880&r1=1653879&r2=1653880&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/solr/CHANGES.txt Thu Jan 22 14:36:59 2015
@@ -485,6 +485,9 @@ Bug Fixes
* SOLR-6640: Close searchers before rollback and recovery to avoid index corruption.
(Robert Muir, Varun Thacker, shalin)
+* SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState.
+ (shalin)
+
Optimizations
----------------------
Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java?rev=1653880&r1=1653879&r2=1653880&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java Thu Jan 22 14:36:59 2015
@@ -193,7 +193,7 @@ public class LeaderInitiatedRecoveryThre
// additional safeguard against the replica trying to be in the active state
// before acknowledging the leader initiated recovery command
- if (continueTrying && collection != null && shardId != null) {
+ if (collection != null && shardId != null) {
try {
// call out to ZooKeeper to get the leader-initiated recovery state
String lirState =
@@ -218,20 +218,25 @@ public class LeaderInitiatedRecoveryThre
List<ZkCoreNodeProps> replicaProps =
zkStateReader.getReplicaProps(collection, shardId, leaderCoreNodeName);
if (replicaProps != null && replicaProps.size() > 0) {
- String replicaState = replicaProps.get(0).getState();
- if (ZkStateReader.ACTIVE.equals(replicaState)) {
- // replica published its state as "active",
- // which is bad if lirState is still "down"
- if (ZkStateReader.DOWN.equals(lirState)) {
- // OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
- // so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
- log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
- + " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
- zkController.ensureReplicaInLeaderInitiatedRecovery(collection,
- shardId, replicaUrl, nodeProps, true); // force republish state to "down"
+ for (ZkCoreNodeProps prop : replicaProps) {
+ if (replicaCoreNodeName.equals(((Replica) prop.getNodeProps()).getName())) {
+ String replicaState = prop.getState();
+ if (ZkStateReader.ACTIVE.equals(replicaState)) {
+ // replica published its state as "active",
+ // which is bad if lirState is still "down"
+ if (ZkStateReader.DOWN.equals(lirState)) {
+ // OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
+ // so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
+ log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
+ + " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
+ zkController.ensureReplicaInLeaderInitiatedRecovery(collection,
+ shardId, replicaUrl, nodeProps, true); // force republish state to "down"
+ }
+ }
+ break;
}
- }
- }
+ }
+ }
}
} catch (Exception ignoreMe) {
log.warn("Failed to determine state of core={} coreNodeName={} due to: "+ignoreMe, coreNeedingRecovery, replicaCoreNodeName);