You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2014/03/21 16:20:32 UTC

svn commit: r1579956 - in /lucene/dev/branches/branch_4x: ./ solr/ solr/CHANGES.txt solr/core/ solr/core/src/java/org/apache/solr/cloud/ZkController.java solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java

Author: shalin
Date: Fri Mar 21 15:20:32 2014
New Revision: 1579956

URL: http://svn.apache.org/r1579956
Log:
SOLR-5860: Use leaderConflictResolveWait in WaitForState during recovery/startup, improve logging and force refresh cluster state every 15 seconds

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/solr/   (props changed)
    lucene/dev/branches/branch_4x/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/solr/core/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/ZkController.java
    lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java

Modified: lucene/dev/branches/branch_4x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/CHANGES.txt?rev=1579956&r1=1579955&r2=1579956&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/solr/CHANGES.txt Fri Mar 21 15:20:32 2014
@@ -86,6 +86,10 @@ New Features
 * SOLR-5865: Provide a MiniSolrCloudCluster to enable easier testing.
  (Greg Chanan via Mark Miller)
 
+* SOLR-5860: Use leaderConflictResolveWait in WaitForState during recovery/startup,
+  improve logging and force refresh cluster state every 15 seconds.
+  (Timothy Potter via shalin)
+
 Bug Fixes
 ----------------------
 

Modified: lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/ZkController.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/ZkController.java?rev=1579956&r1=1579955&r2=1579956&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/ZkController.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/ZkController.java Fri Mar 21 15:20:32 2014
@@ -298,6 +298,10 @@ public final class ZkController {
   public int getLeaderVoteWait() {
     return leaderVoteWait;
   }
+  
+  public int getLeaderConflictResolveWait() {
+    return leaderConflictResolveWait;
+  }
 
   public void forceOverSeer(){
     try {

Modified: lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java?rev=1579956&r1=1579955&r2=1579956&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java Fri Mar 21 15:20:32 2014
@@ -34,6 +34,7 @@ import org.apache.solr.common.SolrExcept
 import org.apache.solr.common.cloud.ClusterState;
 import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.DocRouter;
+import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
@@ -968,6 +969,7 @@ public class CoreAdminHandler extends Re
     log.info("Going to wait for coreNodeName: " + coreNodeName + ", state: " + waitForState
         + ", checkLive: " + checkLive + ", onlyIfLeader: " + onlyIfLeader);
 
+    int maxTries = 0; 
     String state = null;
     boolean live = false;
     int retry = 0;
@@ -991,10 +993,25 @@ public class CoreAdminHandler extends Re
           CloudDescriptor cloudDescriptor = core.getCoreDescriptor()
               .getCloudDescriptor();
           
-          if (retry == 15 || retry == 60) {
+          if (retry % 15 == 0) {
+            if (retry > 0 && log.isInfoEnabled())
+              log.info("After " + retry + " seconds, core " + cname + " (" +
+                  cloudDescriptor.getShardId() + " of " +
+                  cloudDescriptor.getCollectionName() + ") still does not have state: " +
+                  waitForState + "; forcing ClusterState update from ZooKeeper");
+            
             // force a cluster state update
             coreContainer.getZkController().getZkStateReader().updateClusterState(true);
           }
+
+          if (maxTries == 0) {
+            // wait long enough for the leader conflict to work itself out plus a little extra
+            int conflictWaitMs = coreContainer.getZkController().getLeaderConflictResolveWait();
+            maxTries = (int) Math.round(conflictWaitMs / 1000) + 3;
+            log.info("Will wait a max of " + maxTries + " seconds to see " + cname + " (" +
+                cloudDescriptor.getShardId() + " of " +
+                cloudDescriptor.getCollectionName() + ") have state: " + waitForState);
+          }
           
           ClusterState clusterState = coreContainer.getZkController()
               .getClusterState();
@@ -1023,13 +1040,28 @@ public class CoreAdminHandler extends Re
             }
           }
         }
-        
-        if (retry++ == 120) {
+
+        if (retry++ == maxTries) {
+          String collection = null;
+          String leaderInfo = null;
+          String shardId = null;
+          try {
+            CloudDescriptor cloudDescriptor =
+                core.getCoreDescriptor().getCloudDescriptor();
+            collection = cloudDescriptor.getCollectionName();
+            shardId = cloudDescriptor.getShardId();
+            leaderInfo = coreContainer.getZkController().
+                getZkStateReader().getLeaderUrl(collection, shardId, 0);
+          } catch (Exception exc) {
+            leaderInfo = "Not available due to: " + exc;
+          }
+
           throw new SolrException(ErrorCode.BAD_REQUEST,
               "I was asked to wait on state " + waitForState + " for "
-                  + nodeName
+                  + shardId + " in " + collection + " on " + nodeName
                   + " but I still do not see the requested state. I see state: "
-                  + state + " live:" + live);
+                  + state + " live:" + live + " leader from ZK: " + leaderInfo
+          );
         }
         
         if (coreContainer.isShutDown()) {
@@ -1040,7 +1072,6 @@ public class CoreAdminHandler extends Re
         // solrcloud_debug
         if (log.isDebugEnabled()) {
           try {
-            ;
             LocalSolrQueryRequest r = new LocalSolrQueryRequest(core,
                 new ModifiableSolrParams());
             CommitUpdateCommand commitCmd = new CommitUpdateCommand(r, false);