You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/10/01 08:26:56 UTC

lucene-solr:branch_7x: SOLR-12774: IndexOutOfBoundsException on logging startupVersions

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_7x fca89ccc3 -> 38a424ac7


SOLR-12774: IndexOutOfBoundsException on logging startupVersions


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/38a424ac
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/38a424ac
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/38a424ac

Branch: refs/heads/branch_7x
Commit: 38a424ac718cac62d4fec0f198dbc67d11642929
Parents: fca89cc
Author: Cao Manh Dat <da...@apache.org>
Authored: Mon Oct 1 15:25:30 2018 +0700
Committer: Cao Manh Dat <da...@apache.org>
Committed: Mon Oct 1 15:26:28 2018 +0700

----------------------------------------------------------------------
 .../org/apache/solr/cloud/RecoveryStrategy.java | 206 ++++++++++---------
 1 file changed, 105 insertions(+), 101 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/38a424ac/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index aa87801..94e126e 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -321,135 +321,135 @@ public class RecoveryStrategy implements Runnable, Closeable {
 //        + core.getUpdateHandler().getUpdateLog());
 //    return;
 //  }
-  while (!successfulRecovery && !Thread.currentThread().isInterrupted() && !isClosed()) { // don't use interruption or it will close channels though
-    try {
-      CloudDescriptor cloudDesc = core.getCoreDescriptor().getCloudDescriptor();
-      ZkNodeProps leaderprops = zkStateReader.getLeaderRetry(
-          cloudDesc.getCollectionName(), cloudDesc.getShardId());
-      final String leaderBaseUrl = leaderprops.getStr(ZkStateReader.BASE_URL_PROP);
-      final String leaderCoreName = leaderprops.getStr(ZkStateReader.CORE_NAME_PROP);
+    while (!successfulRecovery && !Thread.currentThread().isInterrupted() && !isClosed()) { // don't use interruption or it will close channels though
+      try {
+        CloudDescriptor cloudDesc = core.getCoreDescriptor().getCloudDescriptor();
+        ZkNodeProps leaderprops = zkStateReader.getLeaderRetry(
+            cloudDesc.getCollectionName(), cloudDesc.getShardId());
+        final String leaderBaseUrl = leaderprops.getStr(ZkStateReader.BASE_URL_PROP);
+        final String leaderCoreName = leaderprops.getStr(ZkStateReader.CORE_NAME_PROP);
 
-      String leaderUrl = ZkCoreNodeProps.getCoreUrl(leaderBaseUrl, leaderCoreName);
+        String leaderUrl = ZkCoreNodeProps.getCoreUrl(leaderBaseUrl, leaderCoreName);
 
-      String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
+        String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
 
-      boolean isLeader = leaderUrl.equals(ourUrl); //TODO: We can probably delete most of this code if we say this strategy can only be used for pull replicas
-      if (isLeader && !cloudDesc.isLeader()) {
-        throw new SolrException(ErrorCode.SERVER_ERROR, "Cloud state still says we are leader.");
-      }
-      if (cloudDesc.isLeader()) {
-        assert cloudDesc.getReplicaType() != Replica.Type.PULL;
-        // we are now the leader - no one else must have been suitable
-        log.warn("We have not yet recovered - but we are now the leader!");
-        log.info("Finished recovery process.");
-        zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
-        return;
-      }
-      
-      
-      log.info("Publishing state of core [{}] as recovering, leader is [{}] and I am [{}]", core.getName(), leaderUrl,
-          ourUrl);
-      zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING);
-      
-      if (isClosed()) {
-        log.info("Recovery for core {} has been closed", core.getName());
-        break;
-      }
-      log.info("Starting Replication Recovery.");
+        boolean isLeader = leaderUrl.equals(ourUrl); //TODO: We can probably delete most of this code if we say this strategy can only be used for pull replicas
+        if (isLeader && !cloudDesc.isLeader()) {
+          throw new SolrException(ErrorCode.SERVER_ERROR, "Cloud state still says we are leader.");
+        }
+        if (cloudDesc.isLeader()) {
+          assert cloudDesc.getReplicaType() != Replica.Type.PULL;
+          // we are now the leader - no one else must have been suitable
+          log.warn("We have not yet recovered - but we are now the leader!");
+          log.info("Finished recovery process.");
+          zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
+          return;
+        }
 
-      try {
-        log.info("Stopping background replicate from leader process");
-        zkController.stopReplicationFromLeader(coreName);
-        replicate(zkController.getNodeName(), core, leaderprops);
+
+        log.info("Publishing state of core [{}] as recovering, leader is [{}] and I am [{}]", core.getName(), leaderUrl,
+            ourUrl);
+        zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING);
 
         if (isClosed()) {
           log.info("Recovery for core {} has been closed", core.getName());
           break;
         }
+        log.info("Starting Replication Recovery.");
 
-        log.info("Replication Recovery was successful.");
-        successfulRecovery = true;
-      } catch (Exception e) {
-        SolrException.log(log, "Error while trying to recover", e);
-      }
-
-    } catch (Exception e) {
-      SolrException.log(log, "Error while trying to recover. core=" + coreName, e);
-    } finally {
-      if (successfulRecovery) {
-        log.info("Restaring background replicate from leader process");
-        zkController.startReplicationFromLeader(coreName, false);
-        log.info("Registering as Active after recovery.");
         try {
-          zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
-        } catch (Exception e) {
-          log.error("Could not publish as ACTIVE after succesful recovery", e);
-          successfulRecovery = false;
-        }
-        
-        if (successfulRecovery) {
-          close = true;
-          recoveryListener.recovered();
-        }
-      }
-    }
+          log.info("Stopping background replicate from leader process");
+          zkController.stopReplicationFromLeader(coreName);
+          replicate(zkController.getNodeName(), core, leaderprops);
 
-    if (!successfulRecovery) {
-      // lets pause for a moment and we need to try again...
-      // TODO: we don't want to retry for some problems?
-      // Or do a fall off retry...
-      try {
+          if (isClosed()) {
+            log.info("Recovery for core {} has been closed", core.getName());
+            break;
+          }
 
-        if (isClosed()) {
-          log.info("Recovery for core {} has been closed", core.getName());
-          break;
+          log.info("Replication Recovery was successful.");
+          successfulRecovery = true;
+        } catch (Exception e) {
+          SolrException.log(log, "Error while trying to recover", e);
         }
-        
-        log.error("Recovery failed - trying again... (" + retries + ")");
-        
-        retries++;
-        if (retries >= maxRetries) {
-          SolrException.log(log, "Recovery failed - max retries exceeded (" + retries + ").");
+
+      } catch (Exception e) {
+        SolrException.log(log, "Error while trying to recover. core=" + coreName, e);
+      } finally {
+        if (successfulRecovery) {
+          log.info("Restaring background replicate from leader process");
+          zkController.startReplicationFromLeader(coreName, false);
+          log.info("Registering as Active after recovery.");
           try {
-            recoveryFailed(core, zkController, baseUrl, coreZkNodeName, core.getCoreDescriptor());
+            zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
           } catch (Exception e) {
-            SolrException.log(log, "Could not publish that recovery failed", e);
+            log.error("Could not publish as ACTIVE after succesful recovery", e);
+            successfulRecovery = false;
+          }
+
+          if (successfulRecovery) {
+            close = true;
+            recoveryListener.recovered();
           }
-          break;
         }
-      } catch (Exception e) {
-        SolrException.log(log, "An error has occurred during recovery", e);
       }
 
-      try {
-        // Wait an exponential interval between retries, start at 5 seconds and work up to a minute.
-        // If we're at attempt >= 4, there's no point computing pow(2, retries) because the result
-        // will always be the minimum of the two (12). Since we sleep at 5 seconds sub-intervals in
-        // order to check if we were closed, 12 is chosen as the maximum loopCount (5s * 12 = 1m).
-        int loopCount = retries < 4 ? (int) Math.min(Math.pow(2, retries), 12) : 12;
-        log.info("Wait [{}] seconds before trying to recover again (attempt={})",
-            TimeUnit.MILLISECONDS.toSeconds(loopCount * startingRecoveryDelayMilliSeconds), retries);
-        for (int i = 0; i < loopCount; i++) {
+      if (!successfulRecovery) {
+        // lets pause for a moment and we need to try again...
+        // TODO: we don't want to retry for some problems?
+        // Or do a fall off retry...
+        try {
+
           if (isClosed()) {
             log.info("Recovery for core {} has been closed", core.getName());
-            break; // check if someone closed us
+            break;
+          }
+
+          log.error("Recovery failed - trying again... (" + retries + ")");
+
+          retries++;
+          if (retries >= maxRetries) {
+            SolrException.log(log, "Recovery failed - max retries exceeded (" + retries + ").");
+            try {
+              recoveryFailed(core, zkController, baseUrl, coreZkNodeName, core.getCoreDescriptor());
+            } catch (Exception e) {
+              SolrException.log(log, "Could not publish that recovery failed", e);
+            }
+            break;
           }
-          Thread.sleep(startingRecoveryDelayMilliSeconds);
+        } catch (Exception e) {
+          SolrException.log(log, "An error has occurred during recovery", e);
+        }
+
+        try {
+          // Wait an exponential interval between retries, start at 5 seconds and work up to a minute.
+          // If we're at attempt >= 4, there's no point computing pow(2, retries) because the result
+          // will always be the minimum of the two (12). Since we sleep at 5 seconds sub-intervals in
+          // order to check if we were closed, 12 is chosen as the maximum loopCount (5s * 12 = 1m).
+          int loopCount = retries < 4 ? (int) Math.min(Math.pow(2, retries), 12) : 12;
+          log.info("Wait [{}] seconds before trying to recover again (attempt={})",
+              TimeUnit.MILLISECONDS.toSeconds(loopCount * startingRecoveryDelayMilliSeconds), retries);
+          for (int i = 0; i < loopCount; i++) {
+            if (isClosed()) {
+              log.info("Recovery for core {} has been closed", core.getName());
+              break; // check if someone closed us
+            }
+            Thread.sleep(startingRecoveryDelayMilliSeconds);
+          }
+        } catch (InterruptedException e) {
+          Thread.currentThread().interrupt();
+          log.warn("Recovery was interrupted.", e);
+          close = true;
         }
-      } catch (InterruptedException e) {
-        Thread.currentThread().interrupt();
-        log.warn("Recovery was interrupted.", e);
-        close = true;
       }
-    }
 
+    }
+    // We skip core.seedVersionBuckets(); We don't have a transaction log
+    log.info("Finished recovery process, successful=[{}]", Boolean.toString(successfulRecovery));
   }
-  // We skip core.seedVersionBuckets(); We don't have a transaction log
-  log.info("Finished recovery process, successful=[{}]", Boolean.toString(successfulRecovery));
-}
 
   // TODO: perhaps make this grab a new core each time through the loop to handle core reloads?
-  final public void doSyncOrReplicateRecovery(SolrCore core) throws Exception {
+  public final void doSyncOrReplicateRecovery(SolrCore core) throws Exception {
     boolean successfulRecovery = false;
 
     UpdateLog ulog;
@@ -488,7 +488,11 @@ public class RecoveryStrategy implements Runnable, Closeable {
           log.info("currentVersions size={} range=[{} to {}]", recentVersions.size(), recentVersions.get(0), recentVersions.get(recentVersions.size()-1));
         }
 
-        log.info("startupVersions size={} range=[{} to {}]", startingVersions.size(), startingVersions.get(0), startingVersions.get(startingVersions.size()-1));
+        if (startingVersions.isEmpty()) {
+          log.info("startupVersions is empty");
+        } else {
+          log.info("startupVersions size={} range=[{} to {}]", startingVersions.size(), startingVersions.get(0), startingVersions.get(startingVersions.size()-1));
+        }
       } catch (Exception e) {
         SolrException.log(log, "Error getting recent versions.", e);
         recentVersions = new ArrayList<>(0);