You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2015/01/22 15:37:00 UTC
svn commit: r1653880 - in /lucene/dev/branches/branch_5x: ./ solr/ solr/CHANGES.txt solr/core/ solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java

Author: shalin
Date: Thu Jan 22 14:36:59 2015
New Revision: 1653880

URL: http://svn.apache.org/r1653880
Log:
SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState

Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/solr/   (props changed)
    lucene/dev/branches/branch_5x/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/solr/core/   (props changed)
    lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java

Modified: lucene/dev/branches/branch_5x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/CHANGES.txt?rev=1653880&r1=1653879&r2=1653880&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/solr/CHANGES.txt Thu Jan 22 14:36:59 2015
@@ -485,6 +485,9 @@ Bug Fixes
 * SOLR-6640: Close searchers before rollback and recovery to avoid index corruption.
   (Robert Muir, Varun Thacker, shalin)
 
+* SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState.
+  (shalin)
+
 Optimizations
 ----------------------
 

Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java?rev=1653880&r1=1653879&r2=1653880&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java Thu Jan 22 14:36:59 2015
@@ -193,7 +193,7 @@ public class LeaderInitiatedRecoveryThre
 
         // additional safeguard against the replica trying to be in the active state
         // before acknowledging the leader initiated recovery command
-        if (continueTrying && collection != null && shardId != null) {
+        if (collection != null && shardId != null) {
           try {
             // call out to ZooKeeper to get the leader-initiated recovery state
             String lirState = 
@@ -218,20 +218,25 @@ public class LeaderInitiatedRecoveryThre
               List<ZkCoreNodeProps> replicaProps = 
                   zkStateReader.getReplicaProps(collection, shardId, leaderCoreNodeName);
               if (replicaProps != null && replicaProps.size() > 0) {
-                String replicaState = replicaProps.get(0).getState();
-                if (ZkStateReader.ACTIVE.equals(replicaState)) {
-                  // replica published its state as "active", 
-                  // which is bad if lirState is still "down"
-                  if (ZkStateReader.DOWN.equals(lirState)) {
-                    // OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
-                    // so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
-                    log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
-                        + " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
-                    zkController.ensureReplicaInLeaderInitiatedRecovery(collection, 
-                        shardId, replicaUrl, nodeProps, true); // force republish state to "down"
+                for (ZkCoreNodeProps prop : replicaProps) {
+                  if (replicaCoreNodeName.equals(((Replica) prop.getNodeProps()).getName())) {
+                    String replicaState = prop.getState();
+                    if (ZkStateReader.ACTIVE.equals(replicaState)) {
+                      // replica published its state as "active",
+                      // which is bad if lirState is still "down"
+                      if (ZkStateReader.DOWN.equals(lirState)) {
+                        // OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
+                        // so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
+                        log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
+                            + " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
+                        zkController.ensureReplicaInLeaderInitiatedRecovery(collection,
+                            shardId, replicaUrl, nodeProps, true); // force republish state to "down"
+                      }
+                    }
+                    break;
                   }
-                }                    
-              }                    
+                }
+              }
             }                  
           } catch (Exception ignoreMe) {
             log.warn("Failed to determine state of core={} coreNodeName={} due to: "+ignoreMe, coreNeedingRecovery, replicaCoreNodeName);