You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2015/10/04 18:55:37 UTC

svn commit: r1706701 - in /lucene/dev/branches/branch_5x: ./ solr/ solr/core/ solr/core/src/java/org/apache/solr/cloud/ solr/core/src/test-files/solr/ solr/core/src/test/org/apache/solr/cloud/

Author: markrmiller
Date: Sun Oct  4 16:55:36 2015
New Revision: 1706701

URL: http://svn.apache.org/viewvc?rev=1706701&view=rev
Log:
SOLR-8075: Leader Initiated Recovery should not stop a leader that participated in an election with all of its replicas from becoming a valid leader.

Added:
    lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnShardRestartTest.java
      - copied unchanged from r1706699, lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnShardRestartTest.java
Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/solr/   (props changed)
    lucene/dev/branches/branch_5x/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/solr/core/   (props changed)
    lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
    lucene/dev/branches/branch_5x/solr/core/src/test-files/solr/solr.xml

Modified: lucene/dev/branches/branch_5x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/CHANGES.txt?rev=1706701&r1=1706700&r2=1706701&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/solr/CHANGES.txt Sun Oct  4 16:55:36 2015
@@ -129,6 +129,9 @@ Bug Fixes
 * SOLR-8094: HdfsUpdateLog should not replay buffered documents as a replacement to dropping them.
   (Mark Miller)
 
+* SOLR-8075: Leader Initiated Recovery should not stop a leader that participated in an election with all
+  of its replicas from becoming a valid leader. (Mark Miller)
+
 Optimizations
 ----------------------
 

Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java?rev=1706701&r1=1706700&r2=1706701&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java Sun Oct  4 16:55:36 2015
@@ -288,8 +288,9 @@ final class ShardLeaderElectionContext e
       Overseer.getInQueue(zkClient).offer(Utils.toJSON(m));
       
       int leaderVoteWait = cc.getZkController().getLeaderVoteWait();
+      boolean allReplicasInLine = false;
       if (!weAreReplacement) {
-        waitForReplicasToComeUp(leaderVoteWait);
+        allReplicasInLine = waitForReplicasToComeUp(leaderVoteWait);
       }
       
       if (isClosed) {
@@ -408,6 +409,23 @@ final class ShardLeaderElectionContext e
         }
         
         if (isLeader) {
+          if (allReplicasInLine) {
+            // SOLR-8075: A bug may allow the proper leader to get marked as LIR DOWN and
+            // if we are marked as DOWN but were able to become the leader, we remove
+            // the DOWN entry here so that we don't fail publishing ACTIVE due to being in LIR.
+            // We only do this if all the replicas participated in the election just in case
+            // this was a valid LIR entry and the proper leader replica is missing.
+            try (SolrCore core = cc.getCore(coreName)) {
+              final Replica.State lirState = zkController.getLeaderInitiatedRecoveryState(collection, shardId,
+                  core.getCoreDescriptor().getCloudDescriptor().getCoreNodeName());
+              if (lirState == Replica.State.DOWN) {
+                zkController.updateLeaderInitiatedRecoveryState(collection, shardId,
+                    leaderProps.getStr(ZkStateReader.CORE_NODE_NAME_PROP), Replica.State.ACTIVE, null, true);
+              }
+            }
+            
+          }
+          
           // check for any replicas in my shard that were set to down by the previous leader
           try {
             startLeaderInitiatedRecoveryOnReplicas(coreName);
@@ -477,7 +495,8 @@ final class ShardLeaderElectionContext e
     } // core gets closed automagically    
   }
 
-  private void waitForReplicasToComeUp(int timeoutms) throws InterruptedException {
+  // returns true if all replicas are found to be up, false if not
+  private boolean waitForReplicasToComeUp(int timeoutms) throws InterruptedException {
     long timeoutAt = System.nanoTime() + TimeUnit.NANOSECONDS.convert(timeoutms, TimeUnit.MILLISECONDS);
     final String shardsElectZkPath = electionPath + LeaderElector.ELECTION_NODE;
     
@@ -503,7 +522,7 @@ final class ShardLeaderElectionContext e
         // on startup and after connection timeout, wait for all known shards
         if (found >= slices.getReplicasMap().size()) {
           log.info("Enough replicas found to continue.");
-          return;
+          return true;
         } else {
           if (cnt % 40 == 0) {
             log.info("Waiting until we see more replicas up for shard {}: total={}"
@@ -516,12 +535,12 @@ final class ShardLeaderElectionContext e
         
         if (System.nanoTime() > timeoutAt) {
           log.info("Was waiting for replicas to come up, but they are taking too long - assuming they won't come back till later");
-          return;
+          return false;
         }
       } else {
         log.warn("Shard not found: " + shardId + " for collection " + collection);
 
-        return;
+        return false;
 
       }
       
@@ -529,6 +548,7 @@ final class ShardLeaderElectionContext e
       slices = zkController.getClusterState().getSlice(collection, shardId);
       cnt++;
     }
+    return false;
   }
 
   private void rejoinLeaderElection(SolrCore core)

Modified: lucene/dev/branches/branch_5x/solr/core/src/test-files/solr/solr.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/test-files/solr/solr.xml?rev=1706701&r1=1706700&r2=1706701&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/test-files/solr/solr.xml (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/test-files/solr/solr.xml Sun Oct  4 16:55:36 2015
@@ -37,7 +37,7 @@
     <str name="hostContext">${hostContext:solr}</str>
     <int name="zkClientTimeout">${solr.zkclienttimeout:30000}</int>
     <bool name="genericCoreNodeNames">${genericCoreNodeNames:true}</bool>
-    <int name="leaderVoteWait">0</int>
+    <int name="leaderVoteWait">10000</int>
     <int name="distribUpdateConnTimeout">${distribUpdateConnTimeout:45000}</int>
     <int name="distribUpdateSoTimeout">${distribUpdateSoTimeout:340000}</int>
   </solrcloud>