You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ra...@apache.org on 2012/05/14 20:34:00 UTC

svn commit: r1338335 - in /hbase/branches/0.92/src: main/java/org/apache/hadoop/hbase/ main/java/org/apache/hadoop/hbase/master/ main/java/org/apache/hadoop/hbase/master/handler/ test/java/org/apache/hadoop/hbase/regionserver/

Author: ramkrishna
Date: Mon May 14 18:34:00 2012
New Revision: 1338335

URL: http://svn.apache.org/viewvc?rev=1338335&view=rev
Log:
HBASE-5806 Handle split region related failures on master restart and RS restart (Chinna rao)

Modified:
    hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java
    hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
    hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
    hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java

Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java?rev=1338335&r1=1338334&r2=1338335&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java Mon May 14 18:34:00 2012
@@ -67,7 +67,6 @@ public class LocalHBaseCluster {
   /** 'local:' */
   public static final String LOCAL_COLON = LOCAL + ":";
   private final Configuration conf;
-  private final Class<? extends HMaster> masterClass;
   private final Class<? extends HRegionServer> regionServerClass;
 
   /**
@@ -141,9 +140,6 @@ public class LocalHBaseCluster {
     conf.set(HConstants.MASTER_PORT, "0");
     conf.set(HConstants.REGIONSERVER_PORT, "0");
     // Start the HMasters.
-    this.masterClass =
-      (Class<? extends HMaster>)conf.getClass(HConstants.MASTER_IMPL,
-          masterClass);
     for (int i = 0; i < noMasters; i++) {
       addMaster(new Configuration(conf), i);
     }
@@ -195,9 +191,8 @@ public class LocalHBaseCluster {
     // Create each master with its own Configuration instance so each has
     // its HConnection instance rather than share (see HBASE_INSTANCES down in
     // the guts of HConnectionManager.
-    JVMClusterUtil.MasterThread mt =
-      JVMClusterUtil.createMasterThread(c,
-        this.masterClass, index);
+    JVMClusterUtil.MasterThread mt = JVMClusterUtil.createMasterThread(c,
+        (Class<? extends HMaster>) c.getClass(HConstants.MASTER_IMPL, HMaster.class), index);
     this.masterThreads.add(mt);
     return mt;
   }

Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1338335&r1=1338334&r2=1338335&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Mon May 14 18:34:00 2012
@@ -2404,6 +2404,20 @@ public class AssignmentManager extends Z
         }
         offlineRegions.add(new Pair<HRegionInfo,Result>(regionInfo, result));
       } else {
+        // If the region is both offline and split, check its znode
+        if (regionInfo.isOffline() && regionInfo.isSplit()) {
+          String node = ZKAssign.getNodeName(this.watcher,
+              regionInfo.getEncodedName());
+          Stat stat = new Stat();
+          byte[] data = ZKUtil.getDataNoWatch(this.watcher, node, stat);
+          // If the znode does not exist, don't consider this region
+          if (data == null) {
+            LOG.debug("Region " + regionInfo.getRegionNameAsString()
+                + " split is completed. "
+                + "Hence need not add to regions list");
+            continue;
+          }
+        }
         // Region is being served and on an active server
         // add only if region not in disabled and enabling table
         if (false == checkIfRegionBelongsToDisabled(regionInfo)

Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1338335&r1=1338334&r2=1338335&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Mon May 14 18:34:00 2012
@@ -548,7 +548,7 @@ Server {
     status.setStatus("Starting balancer and catalog janitor");
     this.balancerChore = getAndStartBalancerChore(this);
     this.catalogJanitorChore = new CatalogJanitor(this, this);
-    Threads.setDaemonThreadRunning(catalogJanitorChore.getThread());
+    startCatalogJanitorChore();
 
     registerMBean();
 
@@ -565,7 +565,14 @@ Server {
       }
     }
   }
-
+    
+  /** Starts the catalog janitor chore thread. Overridable so master-restart tests can skip it. */
+  protected void startCatalogJanitorChore() {
+    // Hand the chore's thread to the daemon-thread helper.
+    Thread janitorThread = this.catalogJanitorChore.getThread();
+    Threads.setDaemonThreadRunning(janitorThread);
+  }
+    
   /**
    * Override to change master's splitLogAfterStartup. Used testing
    * @param mfs

Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java?rev=1338335&r1=1338334&r2=1338335&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java Mon May 14 18:34:00 2012
@@ -265,7 +265,7 @@ public class ServerShutdownHandler exten
 
       // Skip regions that were in transition unless CLOSING or PENDING_CLOSE
       for (RegionState rit : regionsInTransition) {
-        if (!rit.isClosing() && !rit.isPendingClose()) {
+        if (!rit.isClosing() && !rit.isPendingClose() && !rit.isSplitting()) {
           LOG.debug("Removed " + rit.getRegion().getRegionNameAsString() +
           " from list of regions to assign because in RIT" + " region state: "
           + rit.getState());

Modified: hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java?rev=1338335&r1=1338334&r2=1338335&view=diff
==============================================================================
--- hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java (original)
+++ hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java Mon May 14 18:34:00 2012
@@ -28,9 +28,12 @@ import java.util.List;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HColumnDescriptor;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.HTableDescriptor;
 import org.apache.hadoop.hbase.MasterNotRunningException;
 import org.apache.hadoop.hbase.MiniHBaseCluster;
 import org.apache.hadoop.hbase.ServerName;
@@ -41,11 +44,13 @@ import org.apache.hadoop.hbase.client.HB
 import org.apache.hadoop.hbase.client.HTable;
 import org.apache.hadoop.hbase.executor.EventHandler.EventType;
 import org.apache.hadoop.hbase.executor.RegionTransitionData;
+import org.apache.hadoop.hbase.master.HMaster;
 import org.apache.hadoop.hbase.master.handler.SplitRegionHandler;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
 import org.apache.hadoop.hbase.util.Threads;
 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
+import org.apache.hadoop.hbase.zookeeper.ZKUtil;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.KeeperException.NodeExistsException;
 import org.apache.zookeeper.data.Stat;
@@ -357,6 +362,175 @@ public class TestSplitTransactionOnClust
       cluster.getMaster().setCatalogJanitorEnabled(true);
     }
   }
+  /**
+   * Verifies HBASE-5806: the master is restarted while a split is only partially
+   * processed, i.e. the parent region's znode still carries a SPLIT or SPLITTING
+   * event. The restarted master must still report a server for the parent region.
+   * @throws IOException
+   * @throws InterruptedException
+   * @throws NodeExistsException
+   * @throws KeeperException
+   */
+  @Test(timeout = 300000)
+  public void testMasterRestartWhenSplittingIsPartial()
+      throws IOException, InterruptedException, NodeExistsException,
+      KeeperException {
+    final byte[] tableName = Bytes.toBytes("testMasterRestartWhenSplittingIsPartial");
+
+    // Create table then get the single region for our new table.
+    HTable t = TESTING_UTIL.createTable(tableName, HConstants.CATALOG_FAMILY);
+
+    List<HRegion> regions = cluster.getRegions(tableName);
+    HRegionInfo hri = getAndCheckSingleTableRegion(regions);
+
+    int tableRegionIndex = ensureTableRegionNotOnSameServerAsMeta(admin, hri);
+
+    // Disable the catalog janitor so it doesn't remove the parent region on us.
+    cluster.getMaster().setCatalogJanitorEnabled(false);
+    // Turn off balancer so it doesn't cut in and mess up our placements.
+    this.admin.balanceSwitch(false);
+    
+    try {
+      // Add a bit of load up into the table so splittable.
+      TESTING_UTIL.loadTable(t, HConstants.CATALOG_FAMILY);
+      // Get region pre-split.
+      HRegionServer server = cluster.getRegionServer(tableRegionIndex);
+      printOutRegions(server, "Initial regions: ");
+      int regionCount = server.getOnlineRegions().size();
+      // Before splitting, set the test-only flag on the master's SplitRegionHandler;
+      // presumably it makes the master skip handling the split (see SplitRegionHandler).
+      SplitRegionHandler.TEST_SKIP = true;
+      // Now try splitting and it should work.
+      split(hri, server, regionCount);
+      // Get daughters
+      List<HRegion> daughters = cluster.getRegions(tableName);
+      assertTrue(daughters.size() >= 2);
+      // Look up the parent region's znode and log its stat.
+      String path = ZKAssign.getNodeName(t.getConnection()
+          .getZooKeeperWatcher(), hri.getEncodedName());
+      Stat stats = t.getConnection().getZooKeeperWatcher()
+          .getRecoverableZooKeeper().exists(path, false);
+      LOG.info("EPHEMERAL NODE BEFORE SERVER ABORT, path=" + path + ", stats="
+          + stats);
+      RegionTransitionData rtd = ZKAssign.getData(t.getConnection()
+          .getZooKeeperWatcher(), hri.getEncodedName());
+      // State could be SPLIT or SPLITTING.
+      assertTrue(rtd.getEventType().equals(EventType.RS_ZK_REGION_SPLIT)
+          || rtd.getEventType().equals(EventType.RS_ZK_REGION_SPLITTING));
+
+
+      // Abort the master and wait for the replacement mock master.
+      MockMasterWithoutCatalogJanitor master = abortAndWaitForMaster();
+
+      this.admin = new HBaseAdmin(TESTING_UTIL.getConfiguration());
+
+      // Mark the local hri as offline and split before querying the new master.
+      hri.setOffline(true);
+      hri.setSplit(true);
+      ServerName regionServerOfRegion = master.getAssignmentManager()
+          .getRegionServerOfRegion(hri);
+      assertTrue(regionServerOfRegion != null);
+
+    } finally {
+      // Restore the flag, balancer and catalog janitor settings changed above.
+      SplitRegionHandler.TEST_SKIP = false;
+      admin.balanceSwitch(true);
+      cluster.getMaster().setCatalogJanitorEnabled(true);
+    }
+  }
+
+
+  /**
+   * Verifies HBASE-5806.  Here the split has completed (its znode is gone) but the
+   * master is killed and restarted before the CatalogJanitor could remove the parent
+   * region.  The restarted master must not report any server for the split parent.
+   * @throws IOException
+   * @throws InterruptedException
+   * @throws NodeExistsException
+   * @throws KeeperException
+   */
+  @Test (timeout = 300000)
+  public void testMasterRestartAtRegionSplitPendingCatalogJanitor()
+      throws IOException, InterruptedException, NodeExistsException,
+      KeeperException {
+    final byte[] tableName = Bytes.toBytes("testMasterRestartAtRegionSplitPendingCatalogJanitor");
+
+    // Create table then get the single region for our new table.
+    this.admin = new HBaseAdmin(TESTING_UTIL.getConfiguration());
+    HTableDescriptor htd = new HTableDescriptor(tableName);
+    HColumnDescriptor hcd = new HColumnDescriptor(HConstants.CATALOG_FAMILY);
+    htd.addFamily(hcd);
+    this.admin.createTable(htd);
+    HTable t = new HTable(TESTING_UTIL.getConfiguration(), tableName);
+
+    List<HRegion> regions = cluster.getRegions(tableName);
+    HRegionInfo hri = getAndCheckSingleTableRegion(regions);
+
+    int tableRegionIndex = ensureTableRegionNotOnSameServerAsMeta(admin, hri);
+
+    // Turn off balancer so it doesn't cut in and mess up our placements.
+    this.admin.balanceSwitch(false);
+    // Disable the catalog janitor so it doesn't remove the parent region on us.
+    cluster.getMaster().setCatalogJanitorEnabled(false);
+    try {
+      // Add a bit of load up into the table so splittable.
+      TESTING_UTIL.loadTable(t, HConstants.CATALOG_FAMILY);
+      // Get region pre-split.
+      HRegionServer server = cluster.getRegionServer(tableRegionIndex);
+      printOutRegions(server, "Initial regions: ");
+      int regionCount = server.getOnlineRegions().size();
+
+      split(hri, server, regionCount);
+      // Get daughters
+      List<HRegion> daughters = cluster.getRegions(tableName);
+      assertTrue(daughters.size() >= 2);
+      // Look up the parent region's znode and log its stat (nothing is asserted here).
+      String path = ZKAssign.getNodeName(t.getConnection()
+          .getZooKeeperWatcher(), hri.getEncodedName());
+      Stat stats = t.getConnection().getZooKeeperWatcher()
+          .getRecoverableZooKeeper().exists(path, false);
+      LOG.info("EPHEMERAL NODE BEFORE SERVER ABORT, path=" + path + ", stats="
+          + stats);
+      // Poll the same znode until its data is gone, which this test treats as the
+      // split being complete.  The loop is bounded only by the @Test timeout.
+      Stat stat = new Stat();
+      byte[] data = ZKUtil.getDataNoWatch(t.getConnection()
+          .getZooKeeperWatcher(), path, stat);
+      while (data != null) {
+        Thread.sleep(1000);
+        data = ZKUtil.getDataNoWatch(t.getConnection().getZooKeeperWatcher(),
+            path, stat);
+      }
+
+      MockMasterWithoutCatalogJanitor master = abortAndWaitForMaster();
+
+      this.admin = new HBaseAdmin(TESTING_UTIL.getConfiguration());
+
+      // Mark the local hri as offline and split before querying the new master.
+      hri.setOffline(true);
+      hri.setSplit(true);
+      ServerName regionServerOfRegion = master.getAssignmentManager()
+          .getRegionServerOfRegion(hri);
+      assertTrue(regionServerOfRegion == null);
+    } finally {
+      // Restore shared state; TEST_SKIP is never set by this test, the reset is defensive.
+      SplitRegionHandler.TEST_SKIP = false;
+      this.admin.balanceSwitch(true);
+      cluster.getMaster().setCatalogJanitorEnabled(true);
+    }
+  }
+  /** Aborts master 0 and brings up a MockMasterWithoutCatalogJanitor in its place. */
+  private MockMasterWithoutCatalogJanitor abortAndWaitForMaster()
+      throws IOException, InterruptedException {
+    cluster.abortMaster(0);
+    cluster.waitOnMaster(0);
+    cluster.getConfiguration().setClass(HConstants.MASTER_IMPL,
+        MockMasterWithoutCatalogJanitor.class, HMaster.class);
+    MockMasterWithoutCatalogJanitor restartedMaster =
+        (MockMasterWithoutCatalogJanitor) cluster.startMaster().getMaster();
+    cluster.waitForActiveAndReadyMaster();
+    return restartedMaster;
+  }
 
   private void split(final HRegionInfo hri, final HRegionServer server,
       final int regionCount)
@@ -459,4 +633,16 @@ public class TestSplitTransactionOnClust
       Thread.sleep(100);
     }
   }
+  
+  public static class MockMasterWithoutCatalogJanitor extends HMaster {
+    public MockMasterWithoutCatalogJanitor(Configuration conf)
+        throws IOException, KeeperException, InterruptedException {
+      super(conf);
+    }
+
+    @Override // Replaces HMaster's hook: the catalog janitor is not started, only a log line.
+    protected void startCatalogJanitorChore() {
+      LOG.debug("Customised master executed.");
+    }
+  }
 }