You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2015/02/08 19:37:52 UTC
svn commit: r1658237 - in /lucene/dev/branches/branch_5x: ./ solr/
solr/core/ solr/core/src/java/org/apache/solr/cloud/
solr/core/src/java/org/apache/solr/update/
solr/core/src/test/org/apache/solr/cloud/
Author: markrmiller
Date: Sun Feb 8 18:37:52 2015
New Revision: 1658237
URL: http://svn.apache.org/r1658237
Log:
SOLR-7033, SOLR-5961: RecoveryStrategy should not publish any state when closed / cancelled, and there should always be a pause between recoveries, even when recoveries are rapidly stopped and started, as well as when a node attempts to become the leader for a shard.
Added:
lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/ActionThrottle.java
- copied unchanged from r1658236, lucene/dev/trunk/solr/core/src/java/org/apache/solr/cloud/ActionThrottle.java
lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/cloud/ActionThrottleTest.java
- copied unchanged from r1658236, lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/ActionThrottleTest.java
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/solr/ (props changed)
lucene/dev/branches/branch_5x/solr/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/solr/core/ (props changed)
lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/update/SolrCoreState.java
Modified: lucene/dev/branches/branch_5x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/CHANGES.txt?rev=1658237&r1=1658236&r2=1658237&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/solr/CHANGES.txt Sun Feb 8 18:37:52 2015
@@ -566,6 +566,12 @@ Bug Fixes
* SOLR-6920: A replicated index can end up corrupted when small files end up with the same
file name and size. (Varun Thacker, Mark Miller)
+* SOLR-7033, SOLR-5961: RecoveryStrategy should not publish any state when
+ closed / cancelled and there should always be a pause between recoveries
+ even when recoveries are rapidly stopped and started as well as when a
+ node attempts to become the leader for a shard.
+ (Mark Miller, Maxim Novikov)
+
Optimizations
----------------------
Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java?rev=1658237&r1=1658236&r2=1658237&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java Sun Feb 8 18:37:52 2015
@@ -200,6 +200,21 @@ final class ShardLeaderElectionContext e
log.info("Running the leader process for shard " + shardId);
String coreName = leaderProps.getStr(ZkStateReader.CORE_NAME_PROP);
+ ActionThrottle lt;
+ try (SolrCore core = cc.getCore(coreName)) {
+
+ if (core == null) {
+ cancelElection();
+ throw new SolrException(ErrorCode.SERVER_ERROR,
+ "SolrCore not found:" + coreName + " in "
+ + cc.getCoreNames());
+ }
+
+ lt = core.getUpdateHandler().getSolrCoreState().getLeaderThrottle();
+ }
+
+ lt.minimumWaitBetweenActions();
+ lt.markAttemptingAction();
// clear the leader in clusterstate
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.LEADER.toLower(),
@@ -217,7 +232,7 @@ final class ShardLeaderElectionContext e
if (core == null) {
cancelElection();
throw new SolrException(ErrorCode.SERVER_ERROR,
- "Fatal Error, SolrCore not found:" + coreName + " in "
+ "SolrCore not found:" + coreName + " in "
+ cc.getCoreNames());
}
Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java?rev=1658237&r1=1658236&r2=1658237&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java Sun Feb 8 18:37:52 2015
@@ -68,8 +68,7 @@ import java.util.concurrent.Future;
public class RecoveryStrategy extends Thread implements ClosableThread {
private static final int MAX_RETRIES = 500;
- private static final int INTERRUPTED = MAX_RETRIES + 1;
- private static final int STARTING_RECOVERY_DELAY = 1000;
+ private static final int STARTING_RECOVERY_DELAY = 5000;
private static final String REPLICATION_HANDLER = "/replication";
@@ -93,6 +92,7 @@ public class RecoveryStrategy extends Th
private CoreContainer cc;
private volatile HttpUriRequest prevSendPreRecoveryHttpUriRequest;
+ // this should only be used from SolrCoreState
public RecoveryStrategy(CoreContainer cc, CoreDescriptor cd, RecoveryListener recoveryListener) {
this.cc = cc;
this.coreName = cd.getName();
@@ -159,7 +159,7 @@ public class RecoveryStrategy extends Th
ModifiableSolrParams solrParams = new ModifiableSolrParams();
solrParams.set(ReplicationHandler.MASTER_URL, leaderUrl);
- if (isClosed()) retries = INTERRUPTED;
+ if (isClosed()) return; // we check closed on return
boolean success = replicationHandler.doFetch(solrParams, false);
if (!success) {
@@ -233,12 +233,10 @@ public class RecoveryStrategy extends Th
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
SolrException.log(log, "", e);
- throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "",
- e);
+ throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
} catch (Exception e) {
log.error("", e);
- throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR,
- "", e);
+ throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
}
} finally {
SolrRequestInfo.clearRequestInfo();
@@ -465,7 +463,7 @@ public class RecoveryStrategy extends Th
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
log.warn("Recovery was interrupted", e);
- retries = INTERRUPTED;
+ close = true;
} catch (Exception e) {
SolrException.log(log, "Error while trying to recover", e);
} finally {
@@ -489,38 +487,22 @@ public class RecoveryStrategy extends Th
// Or do a fall off retry...
try {
- log.error("Recovery failed - trying again... (" + retries + ") core=" + coreName);
-
if (isClosed()) {
- retries = INTERRUPTED;
+ break;
}
+ log.error("Recovery failed - trying again... (" + retries + ") core=" + coreName);
+
retries++;
if (retries >= MAX_RETRIES) {
- if (retries >= INTERRUPTED) {
- SolrException.log(log, "Recovery failed - interrupted. core="
- + coreName);
- try {
- recoveryFailed(core, zkController, baseUrl, coreZkNodeName,
- core.getCoreDescriptor());
- } catch (Exception e) {
- SolrException.log(log,
- "Could not publish that recovery failed", e);
- }
- } else {
- SolrException.log(log,
- "Recovery failed - max retries exceeded (" + retries + "). core=" + coreName);
- try {
- recoveryFailed(core, zkController, baseUrl, coreZkNodeName,
- core.getCoreDescriptor());
- } catch (Exception e) {
- SolrException.log(log,
- "Could not publish that recovery failed", e);
- }
+ SolrException.log(log, "Recovery failed - max retries exceeded (" + retries + "). core=" + coreName);
+ try {
+ recoveryFailed(core, zkController, baseUrl, coreZkNodeName, core.getCoreDescriptor());
+ } catch (Exception e) {
+ SolrException.log(log, "Could not publish that recovery failed", e);
}
break;
}
-
} catch (Exception e) {
SolrException.log(log, "core=" + coreName, e);
}
@@ -536,7 +518,7 @@ public class RecoveryStrategy extends Th
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
log.warn("Recovery was interrupted. core=" + coreName, e);
- retries = INTERRUPTED;
+ close = true;
}
}
Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java?rev=1658237&r1=1658236&r2=1658237&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java Sun Feb 8 18:37:52 2015
@@ -23,6 +23,7 @@ import java.util.concurrent.locks.Reentr
import org.apache.lucene.index.IndexWriter;
import org.apache.solr.cloud.RecoveryStrategy;
+import org.apache.solr.cloud.ActionThrottle;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.core.CoreContainer;
@@ -40,6 +41,10 @@ public final class DefaultSolrCoreState
private final Object recoveryLock = new Object();
+ private final ActionThrottle recoveryThrottle = new ActionThrottle("recovery", 10000);
+
+ private final ActionThrottle leaderThrottle = new ActionThrottle("leader", 5000);
+
// protects pauseWriter and writerFree
private final Object writerPauseLock = new Object();
@@ -313,6 +318,9 @@ public final class DefaultSolrCoreState
// if true, we are recovering after startup and shouldn't have (or be receiving) additional updates (except for local tlog recovery)
boolean recoveringAfterStartup = recoveryStrat == null;
+ recoveryThrottle.minimumWaitBetweenActions();
+ recoveryThrottle.markAttemptingAction();
+
recoveryStrat = new RecoveryStrategy(cc, cd, this);
recoveryStrat.setRecoveringAfterStartup(recoveringAfterStartup);
recoveryStrat.start();
@@ -364,4 +372,10 @@ public final class DefaultSolrCoreState
return commitLock;
}
+ @Override
+ public ActionThrottle getLeaderThrottle() {
+ return leaderThrottle;
+ }
+
+
}
Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/update/SolrCoreState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/update/SolrCoreState.java?rev=1658237&r1=1658236&r2=1658237&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/update/SolrCoreState.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/update/SolrCoreState.java Sun Feb 8 18:37:52 2015
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.util.concurrent.locks.Lock;
import org.apache.lucene.index.IndexWriter;
+import org.apache.solr.cloud.ActionThrottle;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.core.DirectoryFactory;
@@ -140,4 +141,9 @@ public abstract class SolrCoreState {
public abstract void close(IndexWriterCloser closer);
+ /**
+ * @return throttle to limit how fast a core attempts to become leader
+ */
+ public abstract ActionThrottle getLeaderThrottle();
+
}