You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2015/02/27 19:59:57 UTC
svn commit: r1662797 - in /lucene/dev/branches/lucene_solr_4_10: ./ solr/ solr/CHANGES.txt solr/core/ solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java

Author: sarowe
Date: Fri Feb 27 18:59:57 2015
New Revision: 1662797

URL: http://svn.apache.org/r1662797
Log:
SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState (merged branch_5x r1653880)

Modified:
    lucene/dev/branches/lucene_solr_4_10/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/core/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java

Modified: lucene/dev/branches/lucene_solr_4_10/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/solr/CHANGES.txt?rev=1662797&r1=1662796&r2=1662797&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/solr/CHANGES.txt (original)
+++ lucene/dev/branches/lucene_solr_4_10/solr/CHANGES.txt Fri Feb 27 18:59:57 2015
@@ -23,7 +23,7 @@ Bug Fixes
 ----------------------
 
 * SOLR-6931: We should do a limited retry when using HttpClient. 
- (Mark Miller, Hrishikesh Gadre, Gregory Chanan)
+  (Mark Miller, Hrishikesh Gadre, Gregory Chanan)
 
 * SOLR-6780: Fixed a bug in how default/appends/invariants params were affecting the set 
   of all "keys" found in the request parameters, resulting in some key=value param pairs 
@@ -94,6 +94,9 @@ Bug Fixes
   even when recoveries are rapidly stopped and started as well as when a
   node attempts to become the leader for a shard. 
   (Mark Miller, Maxim Novikov)
+  
+* SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState.
+  (shalin)
 
 Other Changes
 ----------------------

Modified: lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java?rev=1662797&r1=1662796&r2=1662797&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java Fri Feb 27 18:59:57 2015
@@ -193,7 +193,7 @@ public class LeaderInitiatedRecoveryThre
 
         // additional safeguard against the replica trying to be in the active state
         // before acknowledging the leader initiated recovery command
-        if (continueTrying && collection != null && shardId != null) {
+        if (collection != null && shardId != null) {
           try {
             // call out to ZooKeeper to get the leader-initiated recovery state
             String lirState = 
@@ -218,20 +218,25 @@ public class LeaderInitiatedRecoveryThre
               List<ZkCoreNodeProps> replicaProps = 
                   zkStateReader.getReplicaProps(collection, shardId, leaderCoreNodeName);
               if (replicaProps != null && replicaProps.size() > 0) {
-                String replicaState = replicaProps.get(0).getState();
-                if (ZkStateReader.ACTIVE.equals(replicaState)) {
-                  // replica published its state as "active", 
-                  // which is bad if lirState is still "down"
-                  if (ZkStateReader.DOWN.equals(lirState)) {
-                    // OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
-                    // so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
-                    log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
-                        + " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
-                    zkController.ensureReplicaInLeaderInitiatedRecovery(collection, 
-                        shardId, replicaUrl, nodeProps, true); // force republish state to "down"
+                for (ZkCoreNodeProps prop : replicaProps) {
+                  if (replicaCoreNodeName.equals(((Replica) prop.getNodeProps()).getName())) {
+                    String replicaState = prop.getState();
+                    if (ZkStateReader.ACTIVE.equals(replicaState)) {
+                      // replica published its state as "active",
+                      // which is bad if lirState is still "down"
+                      if (ZkStateReader.DOWN.equals(lirState)) {
+                        // OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
+                        // so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
+                        log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
+                            + " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
+                        zkController.ensureReplicaInLeaderInitiatedRecovery(collection,
+                            shardId, replicaUrl, nodeProps, true); // force republish state to "down"
+                      }
+                    }
+                    break;
                   }
-                }                    
-              }                    
+                }
+              }
             }                  
           } catch (Exception ignoreMe) {
             log.warn("Failed to determine state of core={} coreNodeName={} due to: "+ignoreMe, coreNeedingRecovery, replicaCoreNodeName);