You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2015/10/02 16:40:15 UTC

svn commit: r1706423 - in /lucene/dev/trunk/solr: CHANGES.txt core/src/java/org/apache/solr/cloud/RecoveryStrategy.java core/src/java/org/apache/solr/update/DefaultSolrCoreState.java

Author: markrmiller
Date: Fri Oct  2 14:40:13 2015
New Revision: 1706423

URL: http://svn.apache.org/viewvc?rev=1706423&view=rev
Log:
SOLR-8085: Fix a variety of issues that can result in replicas getting out of sync.

Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1706423&r1=1706422&r2=1706423&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Fri Oct  2 14:40:13 2015
@@ -221,6 +221,8 @@ Bug Fixes
 
 * SOLR-8095: Allow disabling HDFS Locality Metrics and disable by default as it may have performance
   implications on rapidly changing indexes. (Mike Drob via Mark Miller)
+  
+* SOLR-8085: Fix a variety of issues that can result in replicas getting out of sync. (yonik, Mark Miller)
 
 Optimizations
 ----------------------

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java?rev=1706423&r1=1706422&r2=1706423&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java Fri Oct  2 14:40:13 2015
@@ -54,8 +54,6 @@ import org.apache.solr.logging.MDCLoggin
 import org.apache.solr.request.LocalSolrQueryRequest;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.request.SolrRequestHandler;
-import org.apache.solr.request.SolrRequestInfo;
-import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.search.SolrIndexSearcher;
 import org.apache.solr.update.CommitUpdateCommand;
 import org.apache.solr.update.PeerSync;
@@ -340,6 +338,10 @@ public class RecoveryStrategy extends Th
           return;
         }
         
+        log.info("Begin buffering updates. core=" + coreName);
+        ulog.bufferUpdates();
+        replayed = false;
+        
         log.info("Publishing state of core " + core.getName() + " as recovering, leader is " + leaderUrl + " and I am "
             + ourUrl);
         zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING);
@@ -390,7 +392,7 @@ public class RecoveryStrategy extends Th
                 new ModifiableSolrParams());
             // force open a new searcher
             core.getUpdateHandler().commit(new CommitUpdateCommand(req, false));
-            log.info("PeerSync Recovery was successful - registering as Active.");
+            log.info("PeerSync stage of recovery was successful.");
 
             // solrcloud_debug
             if (log.isDebugEnabled()) {
@@ -410,11 +412,12 @@ public class RecoveryStrategy extends Th
                 log.debug("Error in solrcloud_debug block", e);
               }
             }
-
-            // sync success - register as active and return
-            zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
+            log.info("Replaying updates buffered during PeerSync.");
+            replay(core);
+            replayed = true;
+            
+            // sync success
             successfulRecovery = true;
-            close = true;
             return;
           }
 
@@ -427,10 +430,6 @@ public class RecoveryStrategy extends Th
         }
         
         log.info("Starting Replication Recovery.");
-        
-        log.info("Begin buffering updates.");
-        ulog.bufferUpdates();
-        replayed = false;
 
         try {
 
@@ -449,31 +448,40 @@ public class RecoveryStrategy extends Th
             break;
           }
 
-          log.info("Replication Recovery was successful - registering as Active.");
-          // if there are pending recovery requests, don't advert as active
-          zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
-          close = true;
+          log.info("Replication Recovery was successful.");
           successfulRecovery = true;
-          recoveryListener.recovered();
         } catch (InterruptedException e) {
           Thread.currentThread().interrupt();
           log.warn("Recovery was interrupted", e);
           close = true;
         } catch (Exception e) {
           SolrException.log(log, "Error while trying to recover", e);
-        } finally {
-          if (!replayed) {
-            try {
-              ulog.dropBufferedUpdates();
-            } catch (Exception e) {
-              SolrException.log(log, "", e);
-            }
-          }
-
         }
 
       } catch (Exception e) {
-        SolrException.log(log, "Error while trying to recover.", e);
+        SolrException.log(log, "Error while trying to recover. core=" + coreName, e);
+      } finally {
+        if (!replayed) {
+          try {
+            ulog.dropBufferedUpdates();
+          } catch (Exception e) {
+            SolrException.log(log, "", e);
+          }
+        }
+        if (successfulRecovery) {
+          log.info("Registering as Active after recovery.");
+          try {
+            zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
+          } catch (Exception e) {
+            log.error("Could not publish as ACTIVE after succesful recovery", e);
+            successfulRecovery = false;
+          }
+          
+          if (successfulRecovery) {
+            close = true;
+            recoveryListener.recovered();
+          }
+        }
       }
 
       if (!successfulRecovery) {

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java?rev=1706423&r1=1706422&r2=1706423&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java Fri Oct  2 14:40:13 2015
@@ -41,7 +41,7 @@ public final class DefaultSolrCoreState
   public static Logger log = LoggerFactory.getLogger(DefaultSolrCoreState.class);
   
   private final boolean SKIP_AUTO_RECOVERY = Boolean.getBoolean("solrcloud.skip.autorecovery");
-  
+
   private final Object recoveryLock = new Object();
   
   private final ActionThrottle recoveryThrottle = new ActionThrottle("recovery", 10000);
@@ -59,6 +59,10 @@ public final class DefaultSolrCoreState
   private RecoveryStrategy recoveryStrat;
   private volatile boolean lastReplicationSuccess = true;
 
+  // will we attempt recovery as if we just started up (i.e. use starting versions rather than recent versions for peersync
+  // so we aren't looking at update versions that have started buffering since we came up.
+  private volatile boolean recoveringAfterStartup = true;
+
   private RefCounted<IndexWriter> refCntWriter;
   
   protected final ReentrantLock commitLock = new ReentrantLock();
@@ -272,10 +276,6 @@ public final class DefaultSolrCoreState
           if (closed) return;
         }
         
-        // if true, we are recovering after startup and shouldn't have (or be receiving) additional updates (except for
-        // local tlog recovery)
-        boolean recoveringAfterStartup = recoveryStrat == null;
-        
         recoveryThrottle.minimumWaitBetweenActions();
         recoveryThrottle.markAttemptingAction();
         
@@ -310,11 +310,14 @@ public final class DefaultSolrCoreState
     }
   }
 
+  /** called from recoveryStrat on a successful recovery */
   @Override
   public void recovered() {
+    recoveringAfterStartup = false;  // once we have successfully recovered, we no longer need to act as if we are recovering after startup
     recoveryRunning = false;
   }
 
+  /** called from recoveryStrat on a failed recovery */
   @Override
   public void failed() {
     recoveryRunning = false;