You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by jx...@apache.org on 2014/10/08 05:40:40 UTC

git commit: HBASE-12196 SSH should retry in case failed to assign regions

Repository: hbase
Updated Branches:
  refs/heads/master cab081932 -> f2fc311b1


HBASE-12196 SSH should retry in case failed to assign regions


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/f2fc311b
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/f2fc311b
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/f2fc311b

Branch: refs/heads/master
Commit: f2fc311b19f19031e24547a7e4750a17115ac383
Parents: cab0819
Author: Jimmy Xiang <jx...@cloudera.com>
Authored: Tue Oct 7 15:07:36 2014 -0700
Committer: Jimmy Xiang <jx...@cloudera.com>
Committed: Tue Oct 7 20:23:32 2014 -0700

----------------------------------------------------------------------
 .../master/handler/ServerShutdownHandler.java   |  6 ++
 .../master/TestAssignmentManagerOnCluster.java  | 73 ++++++++++++++++++++
 2 files changed, 79 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/f2fc311b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
index 1691c9d..c443968 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
@@ -273,6 +273,12 @@ public class ServerShutdownHandler extends EventHandler {
       } catch (InterruptedException ie) {
         LOG.error("Caught " + ie + " during round-robin assignment");
         throw (InterruptedIOException)new InterruptedIOException().initCause(ie);
+      } catch (IOException ioe) {
+        LOG.info("Caught " + ioe + " during region assignment, will retry");
+        // Only split HLogs if shouldSplitHlog is set and distributed log replay (DLR) is on
+        serverManager.processDeadServer(serverName,
+          this.shouldSplitHlog && distributedLogReplay);
+        return;
       }
 
       if (this.shouldSplitHlog && distributedLogReplay) {

http://git-wip-us.apache.org/repos/asf/hbase/blob/f2fc311b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
index 4d24268..27c7073 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
@@ -28,8 +28,10 @@ import static org.junit.Assert.fail;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
@@ -79,6 +81,7 @@ import org.junit.experimental.categories.Category;
 /**
  * This tests AssignmentManager with a testing cluster.
  */
+@SuppressWarnings("deprecation")
 @Category({MasterTests.class, MediumTests.class})
 public class TestAssignmentManagerOnCluster {
   private final static byte[] FAMILY = Bytes.toBytes("FAMILY");
@@ -831,6 +834,58 @@ public class TestAssignmentManagerOnCluster {
   }
 
   /**
+   * Test that SSH (ServerShutdownHandler) retries assignment until an extra region server joins
+   */
+  @Test (timeout=300000)
+  public void testSSHWaitForServerToAssignRegion() throws Exception {
+    TableName table = TableName.valueOf("testSSHWaitForServerToAssignRegion");
+    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+    boolean startAServer = false;
+    try {
+      HTableDescriptor desc = new HTableDescriptor(table);
+      desc.addFamily(new HColumnDescriptor(FAMILY));
+      admin.createTable(desc);
+
+      HMaster master = cluster.getMaster();
+      final ServerManager serverManager = master.getServerManager();
+      MyLoadBalancer.countRegionServers = Integer.valueOf(
+        serverManager.countOfRegionServers());
+      HRegionServer rs = TEST_UTIL.getRSForFirstRegionInTable(table);
+      assertNotNull("First region should be assigned", rs);
+      final ServerName serverName = rs.getServerName();
+      // Wait till SSH has tried to assign regions several times
+      int counter = MyLoadBalancer.counter.get() + 5;
+      cluster.killRegionServer(serverName);
+      startAServer = true;
+      cluster.waitForRegionServerToStop(serverName, -1);
+      while (counter > MyLoadBalancer.counter.get()) {
+        Thread.sleep(1000);
+      }
+      cluster.startRegionServer();
+      startAServer = false;
+      // Wait till the dead server is processed by SSH
+      TEST_UTIL.waitFor(120000, 1000, new Waiter.Predicate<Exception>() {
+        @Override
+        public boolean evaluate() throws Exception {
+          return serverManager.isServerDead(serverName)
+            && !serverManager.areDeadServersInProgress();
+        }
+      });
+      TEST_UTIL.waitUntilNoRegionsInTransition(300000);
+
+      rs = TEST_UTIL.getRSForFirstRegionInTable(table);
+      assertTrue("First region should be re-assigned to a different server",
+        rs != null && !serverName.equals(rs.getServerName()));
+    } finally {
+      MyLoadBalancer.countRegionServers = null;
+      TEST_UTIL.deleteTable(table);
+      if (startAServer) {
+        cluster.startRegionServer();
+      }
+    }
+  }
+
+  /**
    * Test force unassign/assign a region of a disabled table
    */
   @Test (timeout=60000)
@@ -1121,6 +1176,9 @@ public class TestAssignmentManagerOnCluster {
     // For this region, if specified, always assign to nowhere
     static volatile String controledRegion = null;
 
+    static volatile Integer countRegionServers = null;
+    static AtomicInteger counter = new AtomicInteger(0);
+
     @Override
     public ServerName randomAssignment(HRegionInfo regionInfo,
         List<ServerName> servers) {
@@ -1129,6 +1187,21 @@ public class TestAssignmentManagerOnCluster {
       }
       return super.randomAssignment(regionInfo, servers);
     }
+
+    @Override
+    public Map<ServerName, List<HRegionInfo>> roundRobinAssignment(
+        List<HRegionInfo> regions, List<ServerName> servers) {
+      if (countRegionServers != null && services != null) {
+        int regionServers = services.getServerManager().countOfRegionServers();
+        if (regionServers < countRegionServers.intValue()) {
+          // Fewer region servers than the test expects: wait till more join in.
+          // Before that, fail region assignments by returning null, counting each attempt.
+          counter.incrementAndGet();
+          return null;
+        }
+      }
+      return super.roundRobinAssignment(regions, servers);
+    }
   }
 
   public static class MyMaster extends HMaster {