You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ra...@apache.org on 2012/05/14 20:14:38 UTC
svn commit: r1338325 - in /hbase/trunk/src:
main/java/org/apache/hadoop/hbase/ main/java/org/apache/hadoop/hbase/master/
main/java/org/apache/hadoop/hbase/master/handler/
test/java/org/apache/hadoop/hbase/regionserver/
Author: ramkrishna
Date: Mon May 14 18:14:37 2012
New Revision: 1338325
URL: http://svn.apache.org/viewvc?rev=1338325&view=rev
Log:
HBASE-5806 Handle split region related failures on master restart and RS restart (Chinna Rao)
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java?rev=1338325&r1=1338324&r2=1338325&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java Mon May 14 18:14:37 2012
@@ -71,7 +71,6 @@ public class LocalHBaseCluster {
/** 'local:' */
public static final String LOCAL_COLON = LOCAL + ":";
private final Configuration conf;
- private final Class<? extends HMaster> masterClass;
private final Class<? extends HRegionServer> regionServerClass;
/**
@@ -145,9 +144,6 @@ public class LocalHBaseCluster {
conf.set(HConstants.MASTER_PORT, "0");
conf.set(HConstants.REGIONSERVER_PORT, "0");
// Start the HMasters.
- this.masterClass =
- (Class<? extends HMaster>)conf.getClass(HConstants.MASTER_IMPL,
- masterClass);
for (int i = 0; i < noMasters; i++) {
addMaster(new Configuration(conf), i);
}
@@ -199,9 +195,8 @@ public class LocalHBaseCluster {
// Create each master with its own Configuration instance so each has
// its HConnection instance rather than share (see HBASE_INSTANCES down in
// the guts of HConnectionManager.
- JVMClusterUtil.MasterThread mt =
- JVMClusterUtil.createMasterThread(c,
- this.masterClass, index);
+ JVMClusterUtil.MasterThread mt = JVMClusterUtil.createMasterThread(c,
+ (Class<? extends HMaster>) c.getClass(HConstants.MASTER_IMPL, HMaster.class), index);
this.masterThreads.add(mt);
return mt;
}
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1338325&r1=1338324&r2=1338325&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Mon May 14 18:14:37 2012
@@ -609,9 +609,14 @@ public class AssignmentManager extends Z
}
failoverProcessedRegions.put(encodedRegionName, regionInfo);
break;
-
+ case RS_ZK_REGION_SPLITTING:
+ LOG.debug("Processed region in state : " + et);
+ break;
+ case RS_ZK_REGION_SPLIT:
+ LOG.debug("Processed region in state : " + et);
+ break;
default:
- throw new IllegalStateException("Received event is not valid.");
+ throw new IllegalStateException("Received region in state :" + et + " is not valid");
}
}
}
@@ -2547,6 +2552,19 @@ public class AssignmentManager extends Z
enableTableIfNotDisabledOrDisablingOrEnabling(disabled,
disablingOrEnabling, tableName);
} else {
+ // If the region is marked both offline and split, check its znode
+ if (regionInfo.isOffline() && regionInfo.isSplit()) {
+ String node = ZKAssign.getNodeName(this.watcher, regionInfo
+ .getEncodedName());
+ Stat stat = new Stat();
+ byte[] data = ZKUtil.getDataNoWatch(this.watcher, node, stat);
+ // If the znode does not exist, don't consider this region
+ if (data == null) {
+ LOG.debug("Region " + regionInfo.getRegionNameAsString()
+ + " split is completed. Hence need not add to regions list");
+ continue;
+ }
+ }
// Region is being served and on an active server
// add only if region not in disabled and enabling table
if (false == checkIfRegionBelongsToDisabled(regionInfo)
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1338325&r1=1338324&r2=1338325&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Mon May 14 18:14:37 2012
@@ -611,7 +611,7 @@ Server {
status.setStatus("Starting balancer and catalog janitor");
this.balancerChore = getAndStartBalancerChore(this);
this.catalogJanitorChore = new CatalogJanitor(this, this);
- Threads.setDaemonThreadRunning(catalogJanitorChore.getThread());
+ startCatalogJanitorChore();
registerMBean();
@@ -628,6 +628,14 @@ Server {
}
}
}
+
+ /**
+ * Starts the catalog janitor chore by handing its thread to
+ * {@code Threads.setDaemonThreadRunning}.
+ * Kept as a separate protected hook so that tests covering master restart
+ * scenarios can override it and prevent the janitor from being started.
+ */
+ protected void startCatalogJanitorChore() {
+ Threads.setDaemonThreadRunning(catalogJanitorChore.getThread());
+ }
/**
* Override to change master's splitLogAfterStartup. Used testing
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java?rev=1338325&r1=1338324&r2=1338325&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java Mon May 14 18:14:37 2012
@@ -269,7 +269,7 @@ public class ServerShutdownHandler exten
// Skip regions that were in transition unless CLOSING or PENDING_CLOSE
for (RegionState rit : regionsInTransition) {
- if (!rit.isClosing() && !rit.isPendingClose()) {
+ if (!rit.isClosing() && !rit.isPendingClose() && !rit.isSplitting()) {
LOG.debug("Removed " + rit.getRegion().getRegionNameAsString() +
" from list of regions to assign because in RIT; region state: " +
rit.getState());
Modified: hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java?rev=1338325&r1=1338324&r2=1338325&view=diff
==============================================================================
--- hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java (original)
+++ hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java Mon May 14 18:14:37 2012
@@ -29,17 +29,20 @@ import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.executor.EventHandler.EventType;
+import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.handler.SplitRegionHandler;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
+import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.apache.zookeeper.data.Stat;
@@ -354,6 +357,178 @@ public class TestSplitTransactionOnClust
cluster.getMaster().setCatalogJanitorEnabled(true);
}
}
+
+  /**
+   * Verifies HBASE-5806 for a partially processed split: the master is aborted
+   * and restarted while the parent region's znode is still in either the
+   * SPLITTING or the SPLIT state. The restarted master is expected to still
+   * report a hosting server for the parent region.
+   *
+   * @throws IOException
+   * @throws InterruptedException
+   * @throws NodeExistsException
+   * @throws KeeperException
+   * @throws DeserializationException
+   */
+  @Test(timeout = 300000)
+  public void testMasterRestartWhenSplittingIsPartial()
+      throws IOException, InterruptedException, NodeExistsException,
+      KeeperException, DeserializationException {
+    final byte[] tableName = Bytes.toBytes("testMasterRestartWhenSplittingIsPartial");
+
+    // The new table's single region is the parent that gets split below.
+    HTable table = TESTING_UTIL.createTable(tableName, HConstants.CATALOG_FAMILY);
+
+    List<HRegion> initialRegions = cluster.getRegions(tableName);
+    HRegionInfo parent = getAndCheckSingleTableRegion(initialRegions);
+
+    int hostingServerIndex = ensureTableRegionNotOnSameServerAsMeta(admin, parent);
+
+    // Keep the balancer from moving regions around underneath the test.
+    this.admin.balanceSwitch(false);
+    // Keep the catalog janitor from removing the parent region underneath the test.
+    cluster.getMaster().setCatalogJanitorEnabled(false);
+    try {
+      // Load some data so that the region can actually be split.
+      TESTING_UTIL.loadTable(table, HConstants.CATALOG_FAMILY);
+      // Record which regions the hosting server carries before the split.
+      HRegionServer hostingServer = cluster.getRegionServer(hostingServerIndex);
+      printOutRegions(hostingServer, "Initial regions: ");
+      // Make the master skip its handling of the split, so the split stays
+      // only partially processed from the master's point of view.
+      SplitRegionHandler.TEST_SKIP = true;
+
+      // The split itself is still expected to go through on the region server.
+      this.admin.split(parent.getRegionNameAsString());
+      while (cluster.getRegions(tableName).size() < 2) {
+        LOG.debug("Waiting on region to split");
+        Thread.sleep(100);
+      }
+
+      // At least the two daughters must be online now.
+      List<HRegion> onlineRegions = cluster.getRegions(tableName);
+      assertTrue(onlineRegions.size() >= 2);
+      // Inspect the parent's znode before the master is aborted.
+      String znodePath = ZKAssign.getNodeName(
+          table.getConnection().getZooKeeperWatcher(), parent.getEncodedName());
+      Stat znodeStat = table.getConnection().getZooKeeperWatcher()
+          .getRecoverableZooKeeper().exists(znodePath, false);
+      LOG.info("EPHEMERAL NODE BEFORE SERVER ABORT, path=" + znodePath + ", stats="
+          + znodeStat);
+      byte[] znodeData = ZKAssign.getData(
+          table.getConnection().getZooKeeperWatcher(), parent.getEncodedName());
+      RegionTransition transition = RegionTransition.parseFrom(znodeData);
+      // Either split state is acceptable at this point.
+      EventType splitState = transition.getEventType();
+      assertTrue(splitState.equals(EventType.RS_ZK_REGION_SPLIT)
+          || splitState.equals(EventType.RS_ZK_REGION_SPLITTING));
+
+      // Abort the active master and wait for the replacement to become active.
+      MockMasterWithoutCatalogJanitor restartedMaster = abortAndWaitForMaster();
+
+      this.admin = new HBaseAdmin(TESTING_UTIL.getConfiguration());
+
+      // Mark the local copy of the parent as offlined and split before the lookup.
+      parent.setOffline(true);
+      parent.setSplit(true);
+      ServerName parentLocation = restartedMaster.getAssignmentManager()
+          .getRegionServerOfRegion(parent);
+      assertTrue(parentLocation != null);
+
+    } finally {
+      // Restore the flag, the balancer and the catalog janitor.
+      SplitRegionHandler.TEST_SKIP = false;
+      admin.balanceSwitch(true);
+      cluster.getMaster().setCatalogJanitorEnabled(true);
+    }
+  }
+
+  /**
+   * Verifies HBASE-5806 for a completed split: the parent region's znode is
+   * already gone, but the master is killed and restarted before the catalog
+   * janitor could remove the parent region. The restarted master must not
+   * report a hosting server for the offlined, split parent.
+   *
+   * @throws IOException
+   * @throws InterruptedException
+   * @throws NodeExistsException
+   * @throws KeeperException
+   */
+  @Test (timeout = 300000)
+  public void testMasterRestartAtRegionSplitPendingCatalogJanitor()
+      throws IOException, InterruptedException, NodeExistsException,
+      KeeperException {
+    final byte[] tableName = Bytes.toBytes("testMasterRestartAtRegionSplitPendingCatalogJanitor");
+
+    // Create table then get the single region for our new table.
+    HTable t = TESTING_UTIL.createTable(tableName, HConstants.CATALOG_FAMILY);
+
+    List<HRegion> regions = cluster.getRegions(tableName);
+    HRegionInfo hri = getAndCheckSingleTableRegion(regions);
+
+    int tableRegionIndex = ensureTableRegionNotOnSameServerAsMeta(admin, hri);
+
+    // Turn off balancer so it doesn't cut in and mess up our placements.
+    this.admin.balanceSwitch(false);
+    // Turn off the catalog janitor so it doesn't remove the parent on us.
+    cluster.getMaster().setCatalogJanitorEnabled(false);
+    try {
+      // Add a bit of load up into the table so splittable.
+      TESTING_UTIL.loadTable(t, HConstants.CATALOG_FAMILY);
+      // Get region pre-split.
+      HRegionServer server = cluster.getRegionServer(tableRegionIndex);
+      printOutRegions(server, "Initial regions: ");
+
+      this.admin.split(hri.getRegionNameAsString());
+      while (cluster.getRegions(tableName).size() < 2) {
+        LOG.debug("Waiting on region to split");
+        Thread.sleep(100);
+      }
+
+      // Get daughters
+      List<HRegion> daughters = cluster.getRegions(tableName);
+      assertTrue(daughters.size() >= 2);
+      // Resolve the parent's znode path once; it is used both for the log
+      // line below and for polling until the znode disappears.
+      String node = ZKAssign.getNodeName(t.getConnection()
+          .getZooKeeperWatcher(), hri.getEncodedName());
+      Stat stats = t.getConnection().getZooKeeperWatcher()
+          .getRecoverableZooKeeper().exists(node, false);
+      LOG.info("EPHEMERAL NODE BEFORE SERVER ABORT, path=" + node + ", stats="
+          + stats);
+      // Wait until the parent's split znode is gone; this test treats that as
+      // the split having completed.
+      Stat stat = new Stat();
+      byte[] data = ZKUtil.getDataNoWatch(t.getConnection()
+          .getZooKeeperWatcher(), node, stat);
+      while (data != null) {
+        Thread.sleep(1000);
+        data = ZKUtil.getDataNoWatch(t.getConnection().getZooKeeperWatcher(),
+            node, stat);
+      }
+      // Abort the active master and wait for the replacement to become active.
+      MockMasterWithoutCatalogJanitor master = abortAndWaitForMaster();
+
+      this.admin = new HBaseAdmin(TESTING_UTIL.getConfiguration());
+
+      // Mark the local copy of the parent as offlined and split before the lookup.
+      hri.setOffline(true);
+      hri.setSplit(true);
+      ServerName regionServerOfRegion = master.getAssignmentManager()
+          .getRegionServerOfRegion(hri);
+      assertTrue(regionServerOfRegion == null);
+    } finally {
+      // This test never sets TEST_SKIP; it is cleared here anyway so the
+      // shared static flag is off when the test finishes.
+      SplitRegionHandler.TEST_SKIP = false;
+      this.admin.balanceSwitch(true);
+      cluster.getMaster().setCatalogJanitorEnabled(true);
+    }
+  }
+
+  /**
+   * Aborts master 0, waits for it to go down, then starts a replacement master
+   * of type {@link MockMasterWithoutCatalogJanitor} and waits until the
+   * cluster has an active, ready master again.
+   *
+   * @return the newly started mock master
+   * @throws IOException
+   * @throws InterruptedException
+   */
+  private MockMasterWithoutCatalogJanitor abortAndWaitForMaster()
+      throws IOException, InterruptedException {
+    // Take the current master down and wait until it has stopped.
+    cluster.abortMaster(0);
+    cluster.waitOnMaster(0);
+
+    // Have the next master be the variant that does not start the catalog janitor.
+    Configuration clusterConf = cluster.getConfiguration();
+    clusterConf.setClass(HConstants.MASTER_IMPL,
+        MockMasterWithoutCatalogJanitor.class, HMaster.class);
+
+    MockMasterWithoutCatalogJanitor restarted =
+        (MockMasterWithoutCatalogJanitor) cluster.startMaster().getMaster();
+    cluster.waitForActiveAndReadyMaster();
+    return restarted;
+  }
private void split(final HRegionInfo hri, final HRegionServer server,
final int regionCount)
@@ -459,6 +634,18 @@ public class TestSplitTransactionOnClust
Thread.sleep(100);
}
}
+
+  /**
+   * {@link HMaster} variant used by the master restart tests in this class. It
+   * overrides {@link #startCatalogJanitorChore()} so that the catalog janitor
+   * thread is never started by the restarted master.
+   */
+  public static class MockMasterWithoutCatalogJanitor extends HMaster {
+
+    public MockMasterWithoutCatalogJanitor(Configuration conf) throws IOException, KeeperException,
+        InterruptedException {
+      super(conf);
+    }
+
+    /**
+     * Deliberately does not start the catalog janitor; it only logs that the
+     * overriding hook was invoked.
+     */
+    @Override
+    protected void startCatalogJanitorChore() {
+      LOG.debug("Customised master executed.");
+    }
+  }
@org.junit.Rule
public org.apache.hadoop.hbase.ResourceCheckerJUnitRule cu =