You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ra...@apache.org on 2012/05/14 20:34:00 UTC
svn commit: r1338335 - in /hbase/branches/0.92/src:
main/java/org/apache/hadoop/hbase/ main/java/org/apache/hadoop/hbase/master/
main/java/org/apache/hadoop/hbase/master/handler/
test/java/org/apache/hadoop/hbase/regionserver/
Author: ramkrishna
Date: Mon May 14 18:34:00 2012
New Revision: 1338335
URL: http://svn.apache.org/viewvc?rev=1338335&view=rev
Log:
HBASE-5806 Handle split region related failures on master restart and RS restart(Chinna rao)
Modified:
hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java
hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java?rev=1338335&r1=1338334&r2=1338335&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java Mon May 14 18:34:00 2012
@@ -67,7 +67,6 @@ public class LocalHBaseCluster {
/** 'local:' */
public static final String LOCAL_COLON = LOCAL + ":";
private final Configuration conf;
- private final Class<? extends HMaster> masterClass;
private final Class<? extends HRegionServer> regionServerClass;
/**
@@ -141,9 +140,6 @@ public class LocalHBaseCluster {
conf.set(HConstants.MASTER_PORT, "0");
conf.set(HConstants.REGIONSERVER_PORT, "0");
// Start the HMasters.
- this.masterClass =
- (Class<? extends HMaster>)conf.getClass(HConstants.MASTER_IMPL,
- masterClass);
for (int i = 0; i < noMasters; i++) {
addMaster(new Configuration(conf), i);
}
@@ -195,9 +191,8 @@ public class LocalHBaseCluster {
// Create each master with its own Configuration instance so each has
// its HConnection instance rather than share (see HBASE_INSTANCES down in
// the guts of HConnectionManager.
- JVMClusterUtil.MasterThread mt =
- JVMClusterUtil.createMasterThread(c,
- this.masterClass, index);
+ JVMClusterUtil.MasterThread mt = JVMClusterUtil.createMasterThread(c,
+ (Class<? extends HMaster>) c.getClass(HConstants.MASTER_IMPL, HMaster.class), index);
this.masterThreads.add(mt);
return mt;
}
Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1338335&r1=1338334&r2=1338335&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Mon May 14 18:34:00 2012
@@ -2404,6 +2404,20 @@ public class AssignmentManager extends Z
}
offlineRegions.add(new Pair<HRegionInfo,Result>(regionInfo, result));
} else {
+ // If the region is offline and split, check whether its znode still exists
+ if (regionInfo.isOffline() && regionInfo.isSplit()) {
+ String node = ZKAssign.getNodeName(this.watcher,
+ regionInfo.getEncodedName());
+ Stat stat = new Stat();
+ byte[] data = ZKUtil.getDataNoWatch(this.watcher, node, stat);
+ // If the znode does not exist, don't consider this region
+ if (data == null) {
+ LOG.debug("Region " + regionInfo.getRegionNameAsString()
+ + " split is completed. "
+ + "Hence need not add to regions list");
+ continue;
+ }
+ }
// Region is being served and on an active server
// add only if region not in disabled and enabling table
if (false == checkIfRegionBelongsToDisabled(regionInfo)
Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1338335&r1=1338334&r2=1338335&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Mon May 14 18:34:00 2012
@@ -548,7 +548,7 @@ Server {
status.setStatus("Starting balancer and catalog janitor");
this.balancerChore = getAndStartBalancerChore(this);
this.catalogJanitorChore = new CatalogJanitor(this, this);
- Threads.setDaemonThreadRunning(catalogJanitorChore.getThread());
+ startCatalogJanitorChore();
registerMBean();
@@ -565,7 +565,14 @@ Server {
}
}
}
-
+
+ /**
+ * Starts the catalog janitor chore's thread. Kept as a separate protected method so
+ * that tests covering master restart scenarios can override it.
+ */
+ protected void startCatalogJanitorChore() {
+ Threads.setDaemonThreadRunning(catalogJanitorChore.getThread());
+ }
+
/**
* Override to change master's splitLogAfterStartup. Used testing
* @param mfs
Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java?rev=1338335&r1=1338334&r2=1338335&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java Mon May 14 18:34:00 2012
@@ -265,7 +265,7 @@ public class ServerShutdownHandler exten
// Skip regions that were in transition unless CLOSING or PENDING_CLOSE
for (RegionState rit : regionsInTransition) {
- if (!rit.isClosing() && !rit.isPendingClose()) {
+ if (!rit.isClosing() && !rit.isPendingClose() && !rit.isSplitting()) {
LOG.debug("Removed " + rit.getRegion().getRegionNameAsString() +
" from list of regions to assign because in RIT" + " region state: "
+ rit.getState());
Modified: hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java?rev=1338335&r1=1338334&r2=1338335&view=diff
==============================================================================
--- hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java (original)
+++ hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java Mon May 14 18:34:00 2012
@@ -28,9 +28,12 @@ import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.ServerName;
@@ -41,11 +44,13 @@ import org.apache.hadoop.hbase.client.HB
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.executor.EventHandler.EventType;
import org.apache.hadoop.hbase.executor.RegionTransitionData;
+import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.handler.SplitRegionHandler;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
+import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.apache.zookeeper.data.Stat;
@@ -357,6 +362,175 @@ public class TestSplitTransactionOnClust
cluster.getMaster().setCatalogJanitorEnabled(true);
}
}
+ /**
+ * Verifies HBASE-5806. The master goes down while a split is only partially
+ * processed, i.e. while the region's znode is still in SPLIT or SPLITTING state.
+ * After the restart the new master must still report a server for the parent region.
+ *
+ * @throws IOException
+ * @throws InterruptedException
+ * @throws NodeExistsException
+ * @throws KeeperException
+ */
+ @Test(timeout = 300000)
+ public void testMasterRestartWhenSplittingIsPartial()
+ throws IOException, InterruptedException, NodeExistsException,
+ KeeperException {
+ final byte[] tableName = Bytes.toBytes("testMasterRestartWhenSplittingIsPartial");
+
+ // Create table then get the single region for our new table.
+ HTable t = TESTING_UTIL.createTable(tableName, HConstants.CATALOG_FAMILY);
+
+ List<HRegion> regions = cluster.getRegions(tableName);
+ HRegionInfo hri = getAndCheckSingleTableRegion(regions);
+
+ int tableRegionIndex = ensureTableRegionNotOnSameServerAsMeta(admin, hri);
+
+ // Turn off the catalog janitor so it doesn't remove the parent on us.
+ cluster.getMaster().setCatalogJanitorEnabled(false);
+ // Turn off balancer so it doesn't cut in and mess up our placements.
+ this.admin.balanceSwitch(false);
+
+ try {
+ // Add a bit of load up into the table so splittable.
+ TESTING_UTIL.loadTable(t, HConstants.CATALOG_FAMILY);
+ // Get region pre-split.
+ HRegionServer server = cluster.getRegionServer(tableRegionIndex);
+ printOutRegions(server, "Initial regions: ");
+ int regionCount = server.getOnlineRegions().size();
+ // Now, before we split, set the special test flag that makes the
+ // master FAIL the processing of the split.
+ SplitRegionHandler.TEST_SKIP = true;
+ // Now try splitting and it should work.
+ split(hri, server, regionCount);
+ // Get daughters
+ List<HRegion> daughters = cluster.getRegions(tableName);
+ assertTrue(daughters.size() >= 2);
+ // Look up the region's znode in zk and log its stat (not asserted here).
+ String path = ZKAssign.getNodeName(t.getConnection()
+ .getZooKeeperWatcher(), hri.getEncodedName());
+ Stat stats = t.getConnection().getZooKeeperWatcher()
+ .getRecoverableZooKeeper().exists(path, false);
+ LOG.info("EPHEMERAL NODE BEFORE SERVER ABORT, path=" + path + ", stats="
+ + stats);
+ RegionTransitionData rtd = ZKAssign.getData(t.getConnection()
+ .getZooKeeperWatcher(), hri.getEncodedName());
+ // State could be SPLIT or SPLITTING.
+ assertTrue(rtd.getEventType().equals(EventType.RS_ZK_REGION_SPLIT)
+ || rtd.getEventType().equals(EventType.RS_ZK_REGION_SPLITTING));
+
+
+ // Abort the master and wait for the new master.
+ MockMasterWithoutCatalogJanitor master = abortAndWaitForMaster();
+
+ this.admin = new HBaseAdmin(TESTING_UTIL.getConfiguration());
+
+ // Mark the hri as offline and split before looking it up in the new master.
+ hri.setOffline(true);
+ hri.setSplit(true);
+ ServerName regionServerOfRegion = master.getAssignmentManager()
+ .getRegionServerOfRegion(hri);
+ assertTrue(regionServerOfRegion != null);
+
+ } finally {
+ // Reset the test flag and re-enable the balancer and catalog janitor.
+ SplitRegionHandler.TEST_SKIP = false;
+ admin.balanceSwitch(true);
+ cluster.getMaster().setCatalogJanitorEnabled(true);
+ }
+ }
+
+
+ /**
+ * Verifies HBASE-5806. Here the split has completed, but the master is killed and
+ * restarted before the catalog janitor could remove the parent region. After the
+ * restart the new master must not report any server for the parent region.
+ * @throws IOException
+ * @throws InterruptedException
+ * @throws NodeExistsException
+ * @throws KeeperException
+ */
+ @Test (timeout = 300000)
+ public void testMasterRestartAtRegionSplitPendingCatalogJanitor()
+ throws IOException, InterruptedException, NodeExistsException,
+ KeeperException {
+ final byte[] tableName = Bytes.toBytes("testMasterRestartAtRegionSplitPendingCatalogJanitor");
+
+ // Create table then get the single region for our new table.
+ this.admin = new HBaseAdmin(TESTING_UTIL.getConfiguration());
+ HTableDescriptor htd = new HTableDescriptor(tableName);
+ HColumnDescriptor hcd = new HColumnDescriptor(HConstants.CATALOG_FAMILY);
+ htd.addFamily(hcd);
+ this.admin.createTable(htd);
+ HTable t = new HTable(TESTING_UTIL.getConfiguration(), tableName);
+
+ List<HRegion> regions = cluster.getRegions(tableName);
+ HRegionInfo hri = getAndCheckSingleTableRegion(regions);
+
+ int tableRegionIndex = ensureTableRegionNotOnSameServerAsMeta(admin, hri);
+
+ // Turn off balancer so it doesn't cut in and mess up our placements.
+ this.admin.balanceSwitch(false);
+ // Turn off the catalog janitor so it doesn't remove the parent on us.
+ cluster.getMaster().setCatalogJanitorEnabled(false);
+ try {
+ // Add a bit of load up into the table so splittable.
+ TESTING_UTIL.loadTable(t, HConstants.CATALOG_FAMILY);
+ // Get region pre-split.
+ HRegionServer server = cluster.getRegionServer(tableRegionIndex);
+ printOutRegions(server, "Initial regions: ");
+ int regionCount = server.getOnlineRegions().size();
+
+ split(hri, server, regionCount);
+ // Get daughters
+ List<HRegion> daughters = cluster.getRegions(tableName);
+ assertTrue(daughters.size() >= 2);
+ // Look up the region's znode in zk and log its stat (not asserted here).
+ String path = ZKAssign.getNodeName(t.getConnection()
+ .getZooKeeperWatcher(), hri.getEncodedName());
+ Stat stats = t.getConnection().getZooKeeperWatcher()
+ .getRecoverableZooKeeper().exists(path, false);
+ LOG.info("EPHEMERAL NODE BEFORE SERVER ABORT, path=" + path + ", stats="
+ + stats);
+ String node = ZKAssign.getNodeName(t.getConnection()
+ .getZooKeeperWatcher(), hri.getEncodedName());
+ Stat stat = new Stat();
+ byte[] data = ZKUtil.getDataNoWatch(t.getConnection()
+ .getZooKeeperWatcher(), node, stat);
+ // Poll once a second until the region's znode no longer returns any data.
+ while (data != null) {
+ Thread.sleep(1000);
+ data = ZKUtil.getDataNoWatch(t.getConnection().getZooKeeperWatcher(),
+ node, stat);
+
+ }
+
+ MockMasterWithoutCatalogJanitor master = abortAndWaitForMaster();
+
+ this.admin = new HBaseAdmin(TESTING_UTIL.getConfiguration());
+
+ // Mark the hri as offline and split before looking it up in the new master.
+ hri.setOffline(true);
+ hri.setSplit(true);
+ ServerName regionServerOfRegion = master.getAssignmentManager()
+ .getRegionServerOfRegion(hri);
+ assertTrue(regionServerOfRegion == null);
+ } finally {
+ // Reset the test flag (this test never sets it) and re-enable balancer and janitor.
+ SplitRegionHandler.TEST_SKIP = false;
+ this.admin.balanceSwitch(true);
+ cluster.getMaster().setCatalogJanitorEnabled(true);
+ }
+ }
+
+ /**
+ * Aborts master 0 and waits on it, then starts a new master of type
+ * {@link MockMasterWithoutCatalogJanitor} and waits for an active and ready master.
+ * NOTE(review): the MASTER_IMPL setting stays in the cluster configuration afterwards.
+ *
+ * @return the newly started master
+ */
+ private MockMasterWithoutCatalogJanitor abortAndWaitForMaster()
+ throws IOException, InterruptedException {
+ cluster.abortMaster(0);
+ cluster.waitOnMaster(0);
+ // Have the next master be created from the mock class.
+ cluster.getConfiguration().setClass(HConstants.MASTER_IMPL,
+ MockMasterWithoutCatalogJanitor.class, HMaster.class);
+ MockMasterWithoutCatalogJanitor master = null;
+ master = (MockMasterWithoutCatalogJanitor) cluster.startMaster().getMaster();
+ cluster.waitForActiveAndReadyMaster();
+ return master;
+ }
private void split(final HRegionInfo hri, final HRegionServer server,
final int regionCount)
@@ -459,4 +633,16 @@ public class TestSplitTransactionOnClust
Thread.sleep(100);
}
}
+
+ /**
+ * HMaster subclass whose {@code startCatalogJanitorChore()} only logs a message
+ * instead of starting the catalog janitor chore's thread.
+ */
+ public static class MockMasterWithoutCatalogJanitor extends HMaster {
+
+ public MockMasterWithoutCatalogJanitor(Configuration conf)
+ throws IOException, KeeperException, InterruptedException {
+ super(conf);
+ }
+
+ // Overrides the HMaster hook; deliberately does not start the janitor thread.
+ protected void startCatalogJanitorChore() {
+ LOG.debug("Customised master executed.");
+ }
+ }
}