You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by jx...@apache.org on 2013/10/18 17:57:30 UTC

svn commit: r1533526 - in /hbase/branches/0.96/hbase-server/src: main/java/org/apache/hadoop/hbase/master/AssignmentManager.java test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java

Author: jxiang
Date: Fri Oct 18 15:57:30 2013
New Revision: 1533526

URL: http://svn.apache.org/r1533526
Log:
HBASE-9793 Offline a region before it's closed could cause double assignment

Modified:
    hbase/branches/0.96/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
    hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java

Modified: hbase/branches/0.96/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1533526&r1=1533525&r2=1533526&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/branches/0.96/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Fri Oct 18 15:57:30 2013
@@ -114,6 +114,10 @@ public class AssignmentManager extends Z
   public static final String ASSIGNMENT_TIMEOUT_MANAGEMENT = "hbase.assignment.timeout.management";
   public static final boolean DEFAULT_ASSIGNMENT_TIMEOUT_MANAGEMENT = false;
 
+  public static final String ALREADY_IN_TRANSITION_WAITTIME
+    = "hbase.assignment.already.intransition.waittime";
+  public static final int DEFAULT_ALREADY_IN_TRANSITION_WAITTIME = 60000; // 1 minute
+
   protected final Server server;
 
   private ServerManager serverManager;
@@ -1666,7 +1670,12 @@ public class AssignmentManager extends Z
     if (state != null) {
       server = state.getServerName();
     }
+    long maxWaitTime = -1;
     for (int i = 1; i <= this.maximumAttempts; i++) {
+      if (this.server.isStopped() || this.server.isAborted()) {
+        LOG.debug("Server stopped/aborted; skipping unassign of " + region);
+        return;
+      }
       // ClosedRegionhandler can remove the server from this.regions
       if (!serverManager.isServerOnline(server)) {
         if (transitionInZK) {
@@ -1685,7 +1694,10 @@ public class AssignmentManager extends Z
           LOG.debug("Sent CLOSE to " + server + " for region " +
             region.getRegionNameAsString());
           if (!transitionInZK && state != null) {
-            regionOffline(region);
+            // Retry to make sure the region is
+            // closed so as to avoid double assignment.
+            unassign(region, state, versionOfClosingNode,
+              dest, transitionInZK,src);
           }
           return;
         }
@@ -1711,11 +1723,34 @@ public class AssignmentManager extends Z
           // RS is already processing this region, only need to update the timestamp
           LOG.debug("update " + state + " the timestamp.");
           state.updateTimestampToNow();
+          if (maxWaitTime < 0) {
+            maxWaitTime = EnvironmentEdgeManager.currentTimeMillis()
+              + this.server.getConfiguration().getLong(ALREADY_IN_TRANSITION_WAITTIME,
+                DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
+          }
+          try {
+            long now = EnvironmentEdgeManager.currentTimeMillis();
+            if (now < maxWaitTime) {
+              LOG.debug("Region is already in transition; "
+                + "waiting up to " + (maxWaitTime - now) + "ms", t);
+              Thread.sleep(100);
+              i--; // reset the try count
+            }
+          } catch (InterruptedException ie) {
+            LOG.warn("Failed to unassign "
+              + region.getRegionNameAsString() + " since interrupted", ie);
+            Thread.currentThread().interrupt();
+            if (!tomActivated) {
+              regionStates.updateRegionState(region, State.FAILED_CLOSE);
+            }
+            return;
+          }
+        } else {
+          LOG.info("Server " + server + " returned " + t + " for "
+            + region.getRegionNameAsString() + ", try=" + i
+            + " of " + this.maximumAttempts, t);
+          // Presume retry or server will expire.
         }
-        LOG.info("Server " + server + " returned " + t + " for "
-          + region.getRegionNameAsString() + ", try=" + i
-          + " of " + this.maximumAttempts, t);
-        // Presume retry or server will expire.
       }
     }
     // Run out of attempts
@@ -1819,10 +1854,15 @@ public class AssignmentManager extends Z
       RegionState currentState = state;
       int versionOfOfflineNode = -1;
       RegionPlan plan = null;
-      long maxRegionServerStartupWaitTime = -1;
+      long maxWaitTime = -1;
       HRegionInfo region = state.getRegion();
       RegionOpeningState regionOpenState;
-      for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) {
+      for (int i = 1; i <= maximumAttempts; i++) {
+        if (server.isStopped() || server.isAborted()) {
+          LOG.info("Skip assigning " + region.getRegionNameAsString()
+            + ", the server is stopped/aborted");
+          return;
+        }
         if (plan == null) { // Get a server for the region at first
           try {
             plan = getRegionPlan(region, forceNewPlan);
@@ -1883,10 +1923,6 @@ public class AssignmentManager extends Z
             continue;
           }
         }
-        if (this.server.isStopped() || this.server.isAborted()) {
-          LOG.debug("Server stopped/aborted; skipping assign of " + region);
-          return;
-        }
         LOG.info("Assigning " + region.getRegionNameAsString() +
             " to " + plan.getDestination().toString());
         // Transition RegionState to PENDING_OPEN
@@ -1940,22 +1976,27 @@ public class AssignmentManager extends Z
 
           if (hold) {
             LOG.warn(assignMsg + ", waiting a little before trying on the same region server " +
-                "try=" + i + " of " + this.maximumAttempts, t);
+              "try=" + i + " of " + this.maximumAttempts, t);
 
-            if (maxRegionServerStartupWaitTime < 0) {
-              maxRegionServerStartupWaitTime = EnvironmentEdgeManager.currentTimeMillis() +
-                  this.server.getConfiguration().
-                      getLong("hbase.regionserver.rpc.startup.waittime", 60000);
+            if (maxWaitTime < 0) {
+              if (t instanceof RegionAlreadyInTransitionException) {
+                maxWaitTime = EnvironmentEdgeManager.currentTimeMillis()
+                  + this.server.getConfiguration().getLong(ALREADY_IN_TRANSITION_WAITTIME,
+                    DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
+              } else {
+                maxWaitTime = this.server.getConfiguration().
+                  getLong("hbase.regionserver.rpc.startup.waittime", 60000);
+              }
             }
             try {
+              needNewPlan = false;
               long now = EnvironmentEdgeManager.currentTimeMillis();
-              if (now < maxRegionServerStartupWaitTime) {
-                LOG.debug("Server is not yet up; waiting up to " +
-                    (maxRegionServerStartupWaitTime - now) + "ms", t);
+              if (now < maxWaitTime) {
+                LOG.debug("Server is not yet up or region is already in transition; "
+                  + "waiting up to " + (maxWaitTime - now) + "ms", t);
                 Thread.sleep(100);
                 i--; // reset the try count
-                needNewPlan = false;
-              } else {
+              } else if (!(t instanceof RegionAlreadyInTransitionException)) {
                 LOG.debug("Server is not up for a while; try a new one", t);
                 needNewPlan = true;
               }

Modified: hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java?rev=1533526&r1=1533525&r2=1533526&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java (original)
+++ hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java Fri Oct 18 15:57:30 2013
@@ -553,6 +553,8 @@ public class TestAssignmentManagerOnClus
       // Unassign it again forcefully so that we can trigger already
       // in transition exception. This test is to make sure this scenario
       // is handled properly.
+      am.server.getConfiguration().setLong(
+        AssignmentManager.ALREADY_IN_TRANSITION_WAITTIME, 1000);
       am.unassign(hri, true);
       RegionState state = am.getRegionStates().getRegionState(hri);
       assertEquals(RegionState.State.FAILED_CLOSE, state.getState());