You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2015/10/02 16:40:15 UTC
svn commit: r1706423 - in /lucene/dev/trunk/solr: CHANGES.txt
core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
Author: markrmiller
Date: Fri Oct 2 14:40:13 2015
New Revision: 1706423
URL: http://svn.apache.org/viewvc?rev=1706423&view=rev
Log:
SOLR-8085: Fix a variety of issues that can result in replicas getting out of sync.
Modified:
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1706423&r1=1706422&r2=1706423&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Fri Oct 2 14:40:13 2015
@@ -221,6 +221,8 @@ Bug Fixes
* SOLR-8095: Allow disabling HDFS Locality Metrics and disable by default as it may have performance
implications on rapidly changing indexes. (Mike Drob via Mark Miller)
+
+* SOLR-8085: Fix a variety of issues that can result in replicas getting out of sync. (yonik, Mark Miller)
Optimizations
----------------------
Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java?rev=1706423&r1=1706422&r2=1706423&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java Fri Oct 2 14:40:13 2015
@@ -54,8 +54,6 @@ import org.apache.solr.logging.MDCLoggin
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrRequestHandler;
-import org.apache.solr.request.SolrRequestInfo;
-import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.update.CommitUpdateCommand;
import org.apache.solr.update.PeerSync;
@@ -340,6 +338,10 @@ public class RecoveryStrategy extends Th
return;
}
+ log.info("Begin buffering updates. core=" + coreName);
+ ulog.bufferUpdates();
+ replayed = false;
+
log.info("Publishing state of core " + core.getName() + " as recovering, leader is " + leaderUrl + " and I am "
+ ourUrl);
zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING);
@@ -390,7 +392,7 @@ public class RecoveryStrategy extends Th
new ModifiableSolrParams());
// force open a new searcher
core.getUpdateHandler().commit(new CommitUpdateCommand(req, false));
- log.info("PeerSync Recovery was successful - registering as Active.");
+ log.info("PeerSync stage of recovery was successful.");
// solrcloud_debug
if (log.isDebugEnabled()) {
@@ -410,11 +412,12 @@ public class RecoveryStrategy extends Th
log.debug("Error in solrcloud_debug block", e);
}
}
-
- // sync success - register as active and return
- zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
+ log.info("Replaying updates buffered during PeerSync.");
+ replay(core);
+ replayed = true;
+
+ // sync success
successfulRecovery = true;
- close = true;
return;
}
@@ -427,10 +430,6 @@ public class RecoveryStrategy extends Th
}
log.info("Starting Replication Recovery.");
-
- log.info("Begin buffering updates.");
- ulog.bufferUpdates();
- replayed = false;
try {
@@ -449,31 +448,40 @@ public class RecoveryStrategy extends Th
break;
}
- log.info("Replication Recovery was successful - registering as Active.");
- // if there are pending recovery requests, don't advert as active
- zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
- close = true;
+ log.info("Replication Recovery was successful.");
successfulRecovery = true;
- recoveryListener.recovered();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
log.warn("Recovery was interrupted", e);
close = true;
} catch (Exception e) {
SolrException.log(log, "Error while trying to recover", e);
- } finally {
- if (!replayed) {
- try {
- ulog.dropBufferedUpdates();
- } catch (Exception e) {
- SolrException.log(log, "", e);
- }
- }
-
}
} catch (Exception e) {
- SolrException.log(log, "Error while trying to recover.", e);
+ SolrException.log(log, "Error while trying to recover. core=" + coreName, e);
+ } finally {
+ if (!replayed) {
+ try {
+ ulog.dropBufferedUpdates();
+ } catch (Exception e) {
+ SolrException.log(log, "", e);
+ }
+ }
+ if (successfulRecovery) {
+ log.info("Registering as Active after recovery.");
+ try {
+ zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
+ } catch (Exception e) {
+ log.error("Could not publish as ACTIVE after succesful recovery", e);
+ successfulRecovery = false;
+ }
+
+ if (successfulRecovery) {
+ close = true;
+ recoveryListener.recovered();
+ }
+ }
}
if (!successfulRecovery) {
Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java?rev=1706423&r1=1706422&r2=1706423&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java Fri Oct 2 14:40:13 2015
@@ -41,7 +41,7 @@ public final class DefaultSolrCoreState
public static Logger log = LoggerFactory.getLogger(DefaultSolrCoreState.class);
private final boolean SKIP_AUTO_RECOVERY = Boolean.getBoolean("solrcloud.skip.autorecovery");
-
+
private final Object recoveryLock = new Object();
private final ActionThrottle recoveryThrottle = new ActionThrottle("recovery", 10000);
@@ -59,6 +59,10 @@ public final class DefaultSolrCoreState
private RecoveryStrategy recoveryStrat;
private volatile boolean lastReplicationSuccess = true;
+ // will we attempt recovery as if we just started up (i.e. use starting versions rather than recent versions for peersync
+ // so we aren't looking at update versions that have started buffering since we came up.
+ private volatile boolean recoveringAfterStartup = true;
+
private RefCounted<IndexWriter> refCntWriter;
protected final ReentrantLock commitLock = new ReentrantLock();
@@ -272,10 +276,6 @@ public final class DefaultSolrCoreState
if (closed) return;
}
- // if true, we are recovering after startup and shouldn't have (or be receiving) additional updates (except for
- // local tlog recovery)
- boolean recoveringAfterStartup = recoveryStrat == null;
-
recoveryThrottle.minimumWaitBetweenActions();
recoveryThrottle.markAttemptingAction();
@@ -310,11 +310,14 @@ public final class DefaultSolrCoreState
}
}
+ /** called from recoveryStrat on a successful recovery */
@Override
public void recovered() {
+ recoveringAfterStartup = false; // once we have successfully recovered, we no longer need to act as if we are recovering after startup
recoveryRunning = false;
}
+ /** called from recoveryStrat on a failed recovery */
@Override
public void failed() {
recoveryRunning = false;