You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2015/02/27 19:43:44 UTC

svn commit: r1662784 - in /lucene/dev/branches/lucene_solr_4_10: ./ solr/ solr/core/ solr/core/src/java/org/apache/solr/cloud/ solr/core/src/java/org/apache/solr/update/ solr/core/src/test/org/apache/solr/cloud/

Author: sarowe
Date: Fri Feb 27 18:43:43 2015
New Revision: 1662784

URL: http://svn.apache.org/r1662784
Log:
SOLR-7033, SOLR-5961: RecoveryStrategy should not publish any state when closed / cancelled and there should always be a pause between recoveries even when recoveries are rapidly stopped and started as well as when a node attempts to become the leader for a shard. (merged branch_5x r1658237)

Added:
    lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/ActionThrottle.java
      - copied unchanged from r1658237, lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/ActionThrottle.java
    lucene/dev/branches/lucene_solr_4_10/solr/core/src/test/org/apache/solr/cloud/ActionThrottleTest.java
      - copied unchanged from r1658237, lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/cloud/ActionThrottleTest.java
Modified:
    lucene/dev/branches/lucene_solr_4_10/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/core/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
    lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
    lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
    lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/update/SolrCoreState.java

Modified: lucene/dev/branches/lucene_solr_4_10/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/solr/CHANGES.txt?rev=1662784&r1=1662783&r2=1662784&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/solr/CHANGES.txt (original)
+++ lucene/dev/branches/lucene_solr_4_10/solr/CHANGES.txt Fri Feb 27 18:43:43 2015
@@ -89,6 +89,12 @@ Bug Fixes
   
 * SOLR-7067: bin/solr won't run under bash 4.2+. (Steve Rowe)
 
+* SOLR-7033, SOLR-5961: RecoveryStrategy should not publish any state when
+  closed / cancelled and there should always be a pause between recoveries 
+  even when recoveries are rapidly stopped and started as well as when a
+  node attempts to become the leader for a shard. 
+  (Mark Miller, Maxim Novikov)
+
 Other Changes
 ----------------------
 

Modified: lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java?rev=1662784&r1=1662783&r2=1662784&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java Fri Feb 27 18:43:43 2015
@@ -194,6 +194,21 @@ final class ShardLeaderElectionContext e
     log.info("Running the leader process for shard " + shardId);
     
     String coreName = leaderProps.getStr(ZkStateReader.CORE_NAME_PROP);
+    ActionThrottle lt;
+    try (SolrCore core = cc.getCore(coreName)) {
+
+      if (core == null) {
+        cancelElection();
+        throw new SolrException(ErrorCode.SERVER_ERROR,
+            "SolrCore not found:" + coreName + " in "
+                + cc.getCoreNames());
+      }
+      
+      lt = core.getUpdateHandler().getSolrCoreState().getLeaderThrottle();
+    }
+    
+    lt.minimumWaitBetweenActions();
+    lt.markAttemptingAction();
     
     // clear the leader in clusterstate
     ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, ZkStateReader.LEADER_PROP,
@@ -211,7 +226,7 @@ final class ShardLeaderElectionContext e
       if (core == null) {
         cancelElection();
         throw new SolrException(ErrorCode.SERVER_ERROR,
-            "Fatal Error, SolrCore not found:" + coreName + " in "
+            "SolrCore not found:" + coreName + " in "
                 + cc.getCoreNames());
       }
       

Modified: lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java?rev=1662784&r1=1662783&r2=1662784&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java Fri Feb 27 18:43:43 2015
@@ -68,8 +68,7 @@ import java.util.concurrent.Future;
 
 public class RecoveryStrategy extends Thread implements ClosableThread {
   private static final int MAX_RETRIES = 500;
-  private static final int INTERRUPTED = MAX_RETRIES + 1;
-  private static final int STARTING_RECOVERY_DELAY = 1000;
+  private static final int STARTING_RECOVERY_DELAY = 5000;
   
   private static final String REPLICATION_HANDLER = "/replication";
 
@@ -93,6 +92,7 @@ public class RecoveryStrategy extends Th
   private CoreContainer cc;
   private volatile HttpUriRequest prevSendPreRecoveryHttpUriRequest;
   
+  // this should only be used from SolrCoreState
   public RecoveryStrategy(CoreContainer cc, CoreDescriptor cd, RecoveryListener recoveryListener) {
     this.cc = cc;
     this.coreName = cd.getName();
@@ -159,7 +159,7 @@ public class RecoveryStrategy extends Th
     ModifiableSolrParams solrParams = new ModifiableSolrParams();
     solrParams.set(ReplicationHandler.MASTER_URL, leaderUrl);
     
-    if (isClosed()) retries = INTERRUPTED;
+    if (isClosed()) return; // we check closed on return
     boolean success = replicationHandler.doFetch(solrParams, false);
     
     if (!success) {
@@ -236,12 +236,10 @@ public class RecoveryStrategy extends Th
       } catch (InterruptedException e) {
         Thread.currentThread().interrupt();
         SolrException.log(log, "", e);
-        throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "",
-            e);
+        throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
       } catch (Exception e) {
         log.error("", e);
-        throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR,
-            "", e);
+        throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
       }
     } finally {
       SolrRequestInfo.clearRequestInfo();
@@ -468,7 +466,7 @@ public class RecoveryStrategy extends Th
         } catch (InterruptedException e) {
           Thread.currentThread().interrupt();
           log.warn("Recovery was interrupted", e);
-          retries = INTERRUPTED;
+          close = true;
         } catch (Exception e) {
           SolrException.log(log, "Error while trying to recover", e);
         } finally {
@@ -492,38 +490,22 @@ public class RecoveryStrategy extends Th
         // Or do a fall off retry...
         try {
 
-          log.error("Recovery failed - trying again... (" + retries + ") core=" + coreName);
-          
           if (isClosed()) {
-            retries = INTERRUPTED;
+            break;
           }
           
+          log.error("Recovery failed - trying again... (" + retries + ") core=" + coreName);
+          
           retries++;
           if (retries >= MAX_RETRIES) {
-            if (retries >= INTERRUPTED) {
-              SolrException.log(log, "Recovery failed - interrupted. core="
-                  + coreName);
-              try {
-                recoveryFailed(core, zkController, baseUrl, coreZkNodeName,
-                    core.getCoreDescriptor());
-              } catch (Exception e) {
-                SolrException.log(log,
-                    "Could not publish that recovery failed", e);
-              }
-            } else {
-              SolrException.log(log,
-                  "Recovery failed - max retries exceeded (" + retries + "). core=" + coreName);
-              try {
-                recoveryFailed(core, zkController, baseUrl, coreZkNodeName,
-                    core.getCoreDescriptor());
-              } catch (Exception e) {
-                SolrException.log(log,
-                    "Could not publish that recovery failed", e);
-              }
+            SolrException.log(log, "Recovery failed - max retries exceeded (" + retries + "). core=" + coreName);
+            try {
+              recoveryFailed(core, zkController, baseUrl, coreZkNodeName, core.getCoreDescriptor());
+            } catch (Exception e) {
+              SolrException.log(log, "Could not publish that recovery failed", e);
             }
             break;
           }
-
         } catch (Exception e) {
           SolrException.log(log, "core=" + coreName, e);
         }
@@ -539,7 +521,7 @@ public class RecoveryStrategy extends Th
         } catch (InterruptedException e) {
           Thread.currentThread().interrupt();
           log.warn("Recovery was interrupted. core=" + coreName, e);
-          retries = INTERRUPTED;
+          close = true;
         }
       }
 

Modified: lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java?rev=1662784&r1=1662783&r2=1662784&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java Fri Feb 27 18:43:43 2015
@@ -24,6 +24,7 @@ import java.util.concurrent.locks.Reentr
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.store.Directory;
 import org.apache.solr.cloud.RecoveryStrategy;
+import org.apache.solr.cloud.ActionThrottle;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.core.CoreContainer;
@@ -42,6 +43,10 @@ public final class DefaultSolrCoreState
   
   private final Object recoveryLock = new Object();
   
+  private final ActionThrottle recoveryThrottle = new ActionThrottle("recovery", 10000);
+  
+  private final ActionThrottle leaderThrottle = new ActionThrottle("leader", 5000);
+  
   // protects pauseWriter and writerFree
   private final Object writerPauseLock = new Object();
   
@@ -325,6 +330,9 @@ public final class DefaultSolrCoreState
       // if true, we are recovering after startup and shouldn't have (or be receiving) additional updates (except for local tlog recovery)
       boolean recoveringAfterStartup = recoveryStrat == null;
 
+      recoveryThrottle.minimumWaitBetweenActions();
+      recoveryThrottle.markAttemptingAction();
+      
       recoveryStrat = new RecoveryStrategy(cc, cd, this);
       recoveryStrat.setRecoveringAfterStartup(recoveringAfterStartup);
       recoveryStrat.start();
@@ -376,4 +384,10 @@ public final class DefaultSolrCoreState
     return commitLock;
   }
   
+  @Override
+  public ActionThrottle getLeaderThrottle() {
+    return leaderThrottle;
+  }
+  
+  
 }

Modified: lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/update/SolrCoreState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/update/SolrCoreState.java?rev=1662784&r1=1662783&r2=1662784&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/update/SolrCoreState.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/solr/core/src/java/org/apache/solr/update/SolrCoreState.java Fri Feb 27 18:43:43 2015
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.util.concurrent.locks.Lock;
 
 import org.apache.lucene.index.IndexWriter;
+import org.apache.solr.cloud.ActionThrottle;
 import org.apache.solr.core.CoreContainer;
 import org.apache.solr.core.CoreDescriptor;
 import org.apache.solr.core.DirectoryFactory;
@@ -140,4 +141,9 @@ public abstract class SolrCoreState {
 
   public abstract void close(IndexWriterCloser closer);
 
+  /**
+   * @return throttle to limit how fast a core attempts to become leader
+   */
+  public abstract ActionThrottle getLeaderThrottle();
+
 }