You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2011/03/22 20:36:21 UTC

svn commit: r1084316 - in /hbase/branches/0.90: CHANGES.txt src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java

Author: stack
Date: Tue Mar 22 19:36:21 2011
New Revision: 1084316

URL: http://svn.apache.org/viewvc?rev=1084316&view=rev
Log:
HBASE-3687 Bulk assign on startup should handle a ServerNotRunningException

Modified:
    hbase/branches/0.90/CHANGES.txt
    hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java

Modified: hbase/branches/0.90/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/CHANGES.txt?rev=1084316&r1=1084315&r2=1084316&view=diff
==============================================================================
--- hbase/branches/0.90/CHANGES.txt (original)
+++ hbase/branches/0.90/CHANGES.txt Tue Mar 22 19:36:21 2011
@@ -47,6 +47,7 @@ Release 0.90.2 - Unreleased
    HBASE-3621  The timeout handler in AssignmentManager does an RPC while
                holding lock on RIT; a big no-no (Ted Yu via Stack)
    HBASE-3575  Update rename table script
+   HBASE-3687  Bulk assign on startup should handle a ServerNotRunningException
 
   IMPROVEMENTS
    HBASE-3542  MultiGet methods in Thrift

Modified: hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1084316&r1=1084315&r2=1084316&view=diff
==============================================================================
--- hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Tue Mar 22 19:36:21 2011
@@ -773,9 +773,27 @@ public class AssignmentManager extends Z
     // Move on to open regions.
     try {
       // Send OPEN RPC. This can fail if the server on other end is is not up.
-      this.serverManager.sendRegionOpen(destination, regions);
+      // If we fail, fail the startup by aborting the server.  There is one
+      // exception we will tolerate: ServerNotRunningException.  This is thrown
+      // between report of regionserver being up and 
+      long maxWaitTime = System.currentTimeMillis() +
+        this.master.getConfiguration().getLong("hbase.regionserver.rpc.startup.waittime", 60000);
+      while (!this.master.isStopped()) {
+        try {
+          this.serverManager.sendRegionOpen(destination, regions);
+        } catch (org.apache.hadoop.hbase.ipc.ServerNotRunningException e) {
+          // This is the one exception to retry.  For all else we should just fail
+          // the startup.
+          long now = System.currentTimeMillis();
+          if (now > maxWaitTime) throw e;
+          LOG.debug("Server is not yet up; waiting up to " +
+              (maxWaitTime - now) + "ms", e);
+          Thread.sleep(1000);
+        }
+      }
     } catch (Throwable t) {
-      this.master.abort("Failed assignment of regions to " + destination, t);
+      this.master.abort("Failed assignment of regions to " + destination +
+        "; bulk assign FAILED", t);
       return;
     }
     LOG.debug("Bulk assigning done for " + destination.getServerName());