You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by te...@apache.org on 2013/04/04 22:21:17 UTC

svn commit: r1464724 - in /hbase/branches/0.94/src: main/java/org/apache/hadoop/hbase/master/ main/java/org/apache/hadoop/hbase/master/handler/ test/java/org/apache/hadoop/hbase/master/ test/java/org/apache/hadoop/hbase/regionserver/

Author: tedyu
Date: Thu Apr  4 20:21:17 2013
New Revision: 1464724

URL: http://svn.apache.org/r1464724
Log:
HBASE-8127 Region of a disabling or disabled table could be stuck in transition state when RS dies during Master initialization (Rajeshbabu)


Modified:
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
    hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManager.java
    hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1464724&r1=1464723&r2=1464724&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Thu Apr  4 20:21:17 2013
@@ -2836,6 +2836,12 @@ public class AssignmentManager extends Z
         // see HBASE-5916
         if (actualDeadServers.contains(deadServer.getKey())) {
           for (Pair<HRegionInfo, Result> deadRegion : deadServer.getValue()) {
+            HRegionInfo hri = deadRegion.getFirst();
+            // Delete znode of region in transition if table is disabled or disabling. If a region
+            // server went down during master initialization then SSH cannot handle the regions of
+            // partially disabled tables because in-memory region state information may not be
+            // available to the master.
+            deleteNodeAndOfflineRegion(hri);
             nodes.remove(deadRegion.getFirst().getEncodedName());
           }
           continue;
@@ -2884,6 +2890,22 @@ public class AssignmentManager extends Z
     }
   }
 
+  /**
+   * If the table is disabling/disabled, delete the region's transition znode and offline it.
+   * @param hri region to offline.
+   */
+  public void deleteNodeAndOfflineRegion(HRegionInfo hri) {
+    if (zkTable.isDisablingOrDisabledTable(hri.getTableNameAsString())) {
+      try {
+        // If the table is partially disabled, delete the znode if it exists, in any state.
+        ZKAssign.deleteNodeFailSilent(this.master.getZooKeeper(), hri);
+      } catch (KeeperException ke) {
+        this.master.abort("Unexpected ZK exception deleting unassigned node " + hri, ke);
+      }
+      regionOffline(hri);
+    }
+  }
+
   /*
    * Presumes caller has taken care of necessary locking modifying servers Map.
    * @param hsi
@@ -3580,7 +3602,7 @@ public class AssignmentManager extends Z
       out.writeLong(stamp.get());
     }
   }
-
+  
   public void stop() {
     this.timeoutMonitor.interrupt();
     this.timerUpdater.interrupt();

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java?rev=1464724&r1=1464723&r2=1464724&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java Thu Apr  4 20:21:17 2013
@@ -288,31 +288,29 @@ public class ServerShutdownHandler exten
       // If the table was partially disabled and the RS went down, we should clear the RIT
       // and remove the node for the region. The rit that we use may be stale in case the table
       // was in DISABLING state but though we did assign we will not be clearing the znode in
-      // CLOSING state. Doing this will have no harm. See HBASE-5927
-      toAssign = checkForDisablingOrDisabledTables(ritsGoingToServer, toAssign, rit, assignmentManager);
+      // CLOSING state. Doing this will have no harm. The rit can be null if region server went
+      // down during master startup. In that case, any znodes that exist for regions of a
+      // partially disabled table are deleted during startup only. See HBASE-8127.
+      toAssign =
+          checkForDisablingOrDisabledTables(ritsGoingToServer, toAssign, rit, e.getKey(),
+            assignmentManager);
     }
     return toAssign;
   }
 
   private List<HRegionInfo> checkForDisablingOrDisabledTables(Set<HRegionInfo> regionsFromRIT,
-      List<HRegionInfo> toAssign, RegionState rit, AssignmentManager assignmentManager) {
-    if (rit == null) {
-      return toAssign;
-    }
-    if (!rit.isClosing() && !rit.isPendingClose()) {
-      return toAssign;
-    }
-    if (!assignmentManager.getZKTable().isDisablingOrDisabledTable(
-        rit.getRegion().getTableNameAsString())) {
-      return toAssign;
-    }
-    HRegionInfo hri = rit.getRegion();
-    AssignmentManager am = assignmentManager;
-    am.deleteClosingOrClosedNode(hri);
-    am.regionOffline(hri);
-    // To avoid region assignment if table is in disabling or disabled state.
-    toAssign.remove(hri);
-    regionsFromRIT.remove(hri);
+      List<HRegionInfo> toAssign, RegionState rit, HRegionInfo hri,
+      AssignmentManager assignmentManager) {
+    boolean disabled =
+        assignmentManager.getZKTable().isDisablingOrDisabledTable(hri.getTableNameAsString());
+    if (disabled) {
+      // To avoid region assignment if table is in disabling or disabled state.
+      toAssign.remove(hri);
+      regionsFromRIT.remove(hri);
+    }
+    if (rit != null && disabled) {
+      assignmentManager.deleteNodeAndOfflineRegion(hri);
+    }
     return toAssign;
   }
 

Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManager.java?rev=1464724&r1=1464723&r2=1464724&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManager.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManager.java Thu Apr  4 20:21:17 2013
@@ -431,10 +431,18 @@ public class TestAssignmentManager {
   @Test
   public void testSSHWhenDisableTableInProgress()
       throws KeeperException, IOException {
-    testCaseWithPartiallyDisabledState(TableState.DISABLING);
-    testCaseWithPartiallyDisabledState(TableState.DISABLED);
+    testCaseWithPartiallyDisabledState(TableState.DISABLING, false);
+    testCaseWithPartiallyDisabledState(TableState.DISABLED, false);
   }
 
+  @Test
+  public void testSSHWhenDisablingTableRegionsInOpeningState()
+      throws KeeperException, IOException {
+    testCaseWithPartiallyDisabledState(TableState.DISABLING, true);
+    testCaseWithPartiallyDisabledState(TableState.DISABLED, true);
+  }
+
+  
   /**
    * To test if the split region is removed from RIT if the region was in SPLITTING state
    * but the RS has actually completed the splitting in META but went down. See HBASE-6070
@@ -500,7 +508,8 @@ public class TestAssignmentManager {
     }
   }
 
-  private void testCaseWithPartiallyDisabledState(TableState state) throws KeeperException, IOException, NodeExistsException {
+  private void testCaseWithPartiallyDisabledState(TableState state, boolean opening)
+      throws KeeperException, IOException, NodeExistsException {
     // Create and startup an executor. This is used by AssignmentManager
     // handling zk callbacks.
     ExecutorService executor = startupMasterExecutor("testSSHWhenDisableTableInProgress");
@@ -511,20 +520,32 @@ public class TestAssignmentManager {
     // Create an AM.
     AssignmentManager am = new AssignmentManager(this.server, this.serverManager, ct, balancer,
         executor);
-    // adding region to regions and servers maps.
-    am.regionOnline(REGIONINFO, SERVERNAME_A);
-    // adding region in pending close.
-    am.regionsInTransition.put(REGIONINFO.getEncodedName(), new RegionState(REGIONINFO,
-        State.PENDING_CLOSE, System.currentTimeMillis(), SERVERNAME_A));
-
+    if (opening) {
+      am.regionsInTransition.put(REGIONINFO.getEncodedName(), new RegionState(REGIONINFO,
+          State.OPENING, System.currentTimeMillis(), SERVERNAME_A));
+    } else {
+      // adding region to regions and servers maps.
+      am.regionOnline(REGIONINFO, SERVERNAME_A);
+      // adding region in pending close.
+      am.regionsInTransition.put(REGIONINFO.getEncodedName(), new RegionState(REGIONINFO,
+          State.PENDING_CLOSE, System.currentTimeMillis(), SERVERNAME_A));
+    }
     if (state == TableState.DISABLING) {
       am.getZKTable().setDisablingTable(REGIONINFO.getTableNameAsString());
     } else {
       am.getZKTable().setDisabledTable(REGIONINFO.getTableNameAsString());
     }
+    RegionTransitionData data = null;
+    if (opening) {
+      data =
+          new RegionTransitionData(EventType.RS_ZK_REGION_OPENING, REGIONINFO.getRegionName(),
+              SERVERNAME_A);
 
-    RegionTransitionData data = new RegionTransitionData(EventType.M_ZK_REGION_CLOSING,
-        REGIONINFO.getRegionName(), SERVERNAME_A);
+    } else {
+      data =
+          new RegionTransitionData(EventType.M_ZK_REGION_CLOSING, REGIONINFO.getRegionName(),
+              SERVERNAME_A);
+    }
     String node = ZKAssign.getNodeName(this.watcher, REGIONINFO.getEncodedName());
     // create znode in M_ZK_REGION_CLOSING state.
     ZKUtil.createAndWatch(this.watcher, node, data.getBytes());
@@ -534,12 +555,8 @@ public class TestAssignmentManager {
       // check znode deleted or not.
       // In both cases the znode should be deleted.
       assertTrue("The znode should be deleted.",ZKUtil.checkExists(this.watcher, node) == -1);
-      // check whether in rit or not.  In the DISABLING case also the below assert will be true
-      // but the piece of code added for HBASE-5927 will not do that.
-      if (state == TableState.DISABLED) {
-        assertTrue("Region state of region in pending close should be removed from rit.",
-            am.regionsInTransition.isEmpty());
-      }
+      assertTrue("Region state of region in pending close should be removed from rit.",
+        am.regionsInTransition.isEmpty());
     } finally {
       executor.shutdown();
       am.shutdown();

Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java?rev=1464724&r1=1464723&r2=1464724&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java Thu Apr  4 20:21:17 2013
@@ -25,7 +25,6 @@ import static org.junit.Assert.assertTru
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.Set;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -43,6 +42,7 @@ import org.apache.hadoop.hbase.client.HT
 import org.apache.hadoop.hbase.client.Put;
 import org.apache.hadoop.hbase.client.ResultScanner;
 import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.master.AssignmentManager;
 import org.apache.hadoop.hbase.master.HMaster;
 import org.apache.hadoop.hbase.master.MasterFileSystem;
 import org.apache.hadoop.hbase.master.ServerManager;
@@ -51,6 +51,7 @@ import org.apache.hadoop.hbase.util.Byte
 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
 import org.apache.hadoop.hbase.util.Threads;
 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
+import org.apache.hadoop.hbase.zookeeper.ZKTable;
 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
 import org.apache.zookeeper.KeeperException;
 import org.junit.AfterClass;
@@ -64,7 +65,7 @@ public class TestRSKilledWhenMasterIniti
 
   private static final HBaseTestingUtility TESTUTIL = new HBaseTestingUtility();
   private static final int NUM_MASTERS = 1;
-  private static final int NUM_RS = 4;
+  private static final int NUM_RS = 5;
 
   @BeforeClass
   public static void setUpBeforeClass() throws Exception {
@@ -72,7 +73,7 @@ public class TestRSKilledWhenMasterIniti
     Configuration conf = TESTUTIL.getConfiguration();
     conf.setClass(HConstants.MASTER_IMPL, TestingMaster.class, HMaster.class);
     conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 3);
-    conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 4);
+    conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 5);
 
     // Start up the cluster.
     TESTUTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
@@ -234,6 +235,61 @@ public class TestRSKilledWhenMasterIniti
     assertEquals(3, count);
   }
 
+  @Test (timeout=180000)
+  public void testMasterFailoverWhenDisablingTableRegionsInRITOnDeadRS() throws Exception {
+    MiniHBaseCluster cluster = TESTUTIL.getHBaseCluster();
+    HMaster master = cluster.getMaster();
+    // disable load balancing on this master
+    master.balanceSwitch(false);
+
+    final String table = "testMasterFailoverWhenDisablingTableRegionsInRITOnDeadRS";
+    byte [] FAMILY = Bytes.toBytes("family");
+    byte[][] SPLIT_KEYS =
+        new byte[][] {Bytes.toBytes("a"), Bytes.toBytes("b"), Bytes.toBytes("c"),
+            Bytes.toBytes("d") };
+    HTableDescriptor htd = new HTableDescriptor(table);
+    HColumnDescriptor hcd = new HColumnDescriptor(FAMILY);
+    htd.addFamily(hcd);
+    TESTUTIL.getHBaseAdmin().createTable(htd, SPLIT_KEYS);
+    AssignmentManager am = cluster.getMaster().getAssignmentManager();
+    List<HRegionInfo> regionsOfTable = null;
+    while ((regionsOfTable = am.getRegionsOfTable(table.getBytes())).size()
+        != (SPLIT_KEYS.length + 1)) {
+      Thread.sleep(10);
+    }
+    HRegionInfo closingRegion = regionsOfTable.get(0);
+    ServerName serverName = am.getRegionServerOfRegion(closingRegion);
+    HRegionServer deadRS = null;
+    for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
+      deadRS = cluster.getRegionServer(i);
+      if (deadRS.getServerName().equals(serverName)) {
+        break;
+      }
+    }
+
+    // Disable the table in ZK
+    ZKTable zkTable = am.getZKTable();
+    zkTable.setDisablingTable(table);
+    ZKAssign.createNodeClosing(master.getZooKeeper(), closingRegion, serverName);
+
+    // Stop the master
+    abortMaster(cluster);
+    master = startMasterAndWaitUntilLogSplit(cluster);
+    deadRS.kill();
+    deadRS.join();
+    waitUntilMasterIsInitialized(master);
+    am = cluster.getMaster().getAssignmentManager();
+    zkTable = am.getZKTable();
+    // wait for no more RIT
+    ZKAssign.blockUntilNoRIT(master.getZooKeeper());
+    while (!master.getAssignmentManager().getZKTable().isDisabledTable(table)) {
+      Thread.sleep(10);
+    }
+    assertTrue("Table should be disabled state.", zkTable.isDisabledTable(table));
+    HBaseAdmin admin = new HBaseAdmin(master.getConfiguration());
+    admin.deleteTable(table);
+  }
+
   private void abortMaster(MiniHBaseCluster cluster)
       throws InterruptedException {
     for (MasterThread mt : cluster.getLiveMasterThreads()) {