You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ra...@apache.org on 2012/06/05 18:40:47 UTC
svn commit: r1346461 - in /hbase/branches/0.94/src:
main/java/org/apache/hadoop/hbase/master/
test/java/org/apache/hadoop/hbase/master/
Author: ramkrishna
Date: Tue Jun 5 16:40:47 2012
New Revision: 1346461
URL: http://svn.apache.org/viewvc?rev=1346461&view=rev
Log:
HBASE-6046 Master retry on ZK session expiry causes inconsistent region assignments. (Ashutosh)
Modified:
hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java
hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java
Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1346461&r1=1346460&r2=1346461&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Tue Jun 5 16:40:47 2012
@@ -341,7 +341,7 @@ Server {
// We are either the active master or we were asked to shutdown
if (!this.stopped) {
- finishInitialization(startupStatus);
+ finishInitialization(startupStatus, false);
loop();
}
} catch (Throwable t) {
@@ -459,12 +459,13 @@ Server {
* <li>Ensure assignment of root and meta regions<li>
* <li>Handle either fresh cluster start or master failover</li>
* </ol>
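+ * @param masterRecovery true when re-initializing after recovery from an expired ZK session; one-time setup (executor service, server manager, coprocessors, service threads, chores) is then skipped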
*
* @throws IOException
* @throws InterruptedException
* @throws KeeperException
*/
- private void finishInitialization(MonitoredTask status)
+ private void finishInitialization(MonitoredTask status, boolean masterRecovery)
throws IOException, InterruptedException, KeeperException {
isActiveMaster = true;
@@ -478,7 +479,7 @@ Server {
status.setStatus("Initializing Master file system");
this.masterActiveTime = System.currentTimeMillis();
// TODO: Do this using Dependency Injection, using PicoContainer, Guice or Spring.
- this.fileSystemManager = new MasterFileSystem(this, this, metrics);
+ this.fileSystemManager = new MasterFileSystem(this, this, metrics, masterRecovery);
this.tableDescriptors =
new FSTableDescriptors(this.fileSystemManager.getFileSystem(),
@@ -487,21 +488,24 @@ Server {
// publish cluster ID
status.setStatus("Publishing Cluster ID in ZooKeeper");
ClusterId.setClusterId(this.zooKeeper, fileSystemManager.getClusterId());
+ if (!masterRecovery) {
+ this.executorService = new ExecutorService(getServerName().toString());
+ this.serverManager = new ServerManager(this, this);
+ }
- this.executorService = new ExecutorService(getServerName().toString());
-
- this.serverManager = new ServerManager(this, this);
status.setStatus("Initializing ZK system trackers");
initializeZKBasedSystemTrackers();
-
- // initialize master side coprocessors before we start handling requests
- status.setStatus("Initializing master coprocessors");
- this.cpHost = new MasterCoprocessorHost(this, this.conf);
-
- // start up all service threads.
- status.setStatus("Initializing master service threads");
- startServiceThreads();
+
+ if (!masterRecovery) {
+ // initialize master side coprocessors before we start handling requests
+ status.setStatus("Initializing master coprocessors");
+ this.cpHost = new MasterCoprocessorHost(this, this.conf);
+
+ // start up all service threads.
+ status.setStatus("Initializing master service threads");
+ startServiceThreads();
+ }
// Wait for region servers to report in.
this.serverManager.waitForRegionServers(status);
@@ -514,8 +518,9 @@ Server {
this.serverManager.recordNewServer(sn, HServerLoad.EMPTY_HSERVERLOAD);
}
}
-
- this.assignmentManager.startTimeOutMonitor();
+ if (!masterRecovery) {
+ this.assignmentManager.startTimeOutMonitor();
+ }
// TODO: Should do this in background rather than block master startup
status.setStatus("Splitting logs after master startup");
splitLogAfterStartup(this.fileSystemManager);
@@ -543,13 +548,15 @@ Server {
status.setStatus("Fixing up missing daughters");
fixupDaughters(status);
- // Start balancer and meta catalog janitor after meta and regions have
- // been assigned.
- status.setStatus("Starting balancer and catalog janitor");
- this.balancerChore = getAndStartBalancerChore(this);
- this.catalogJanitorChore = new CatalogJanitor(this, this);
- startCatalogJanitorChore();
- registerMBean();
+ if (!masterRecovery) {
+ // Start balancer and meta catalog janitor after meta and regions have
+ // been assigned.
+ status.setStatus("Starting balancer and catalog janitor");
+ this.balancerChore = getAndStartBalancerChore(this);
+ this.catalogJanitorChore = new CatalogJanitor(this, this);
+ startCatalogJanitorChore();
+ registerMBean();
+ }
status.markComplete("Initialization successful");
LOG.info("Master has completed initialization");
@@ -560,12 +567,14 @@ Server {
// master initialization. See HBASE-5916.
this.serverManager.clearDeadServersWithSameHostNameAndPortOfOnlineServer();
- if (this.cpHost != null) {
- // don't let cp initialization errors kill the master
- try {
- this.cpHost.postStartMaster();
- } catch (IOException ioe) {
- LOG.error("Coprocessor postStartMaster() hook failed", ioe);
+ if (!masterRecovery) {
+ if (this.cpHost != null) {
+ // don't let cp initialization errors kill the master
+ try {
+ this.cpHost.postStartMaster();
+ } catch (IOException ioe) {
+ LOG.error("Coprocessor postStartMaster() hook failed", ioe);
+ }
}
}
}
@@ -1433,13 +1442,9 @@ Server {
if (!becomeActiveMaster(status)) {
return Boolean.FALSE;
}
- initializeZKBasedSystemTrackers();
- // Update in-memory structures to reflect our earlier Root/Meta assignment.
- assignRootAndMeta(status);
- // process RIT if any
- // TODO: Why does this not call AssignmentManager.joinCluster? Otherwise
- // we are not processing dead servers if any.
- assignmentManager.processDeadServersAndRegionsInTransition();
+ serverShutdownHandlerEnabled = false;
+ initialized = false;
+ finishInitialization(status, true);
return Boolean.TRUE;
} finally {
status.cleanup();
Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java?rev=1346461&r1=1346460&r2=1346461&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java Tue Jun 5 16:40:47 2012
@@ -81,7 +81,7 @@ public class MasterFileSystem {
private final MasterServices services;
public MasterFileSystem(Server master, MasterServices services,
- MasterMetrics metrics)
+ MasterMetrics metrics, boolean masterRecovery)
throws IOException {
this.conf = master.getConfiguration();
this.master = master;
@@ -103,7 +103,7 @@ public class MasterFileSystem {
if (this.distributedLogSplitting) {
this.splitLogManager = new SplitLogManager(master.getZooKeeper(),
master.getConfiguration(), master, master.getServerName().toString());
- this.splitLogManager.finishInitialization();
+ this.splitLogManager.finishInitialization(masterRecovery);
} else {
this.splitLogManager = null;
}
Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java?rev=1346461&r1=1346460&r2=1346461&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java Tue Jun 5 16:40:47 2012
@@ -182,9 +182,11 @@ public class SplitLogManager extends Zoo
stopper);
}
- public void finishInitialization() {
- Threads.setDaemonThreadRunning(timeoutMonitor.getThread(), serverName +
- ".splitLogManagerTimeoutMonitor");
+ public void finishInitialization(boolean masterRecovery) {
+ if (!masterRecovery) {
+ Threads.setDaemonThreadRunning(timeoutMonitor.getThread(), serverName
+ + ".splitLogManagerTimeoutMonitor");
+ }
// Watcher can be null during tests with Mock'd servers.
if (this.watcher != null) {
this.watcher.registerListener(this);
@@ -1196,4 +1198,11 @@ public class SplitLogManager extends Zoo
return statusMsg;
}
}
+
+ /**
+ * Completes the initialization for a normal (non-recovery) startup.
+ */
+ public void finishInitialization() {
+ finishInitialization(false);
+ }
}
Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java?rev=1346461&r1=1346460&r2=1346461&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java Tue Jun 5 16:40:47 2012
@@ -148,7 +148,7 @@ public class TestCatalogJanitor {
private final AssignmentManager asm;
MockMasterServices(final Server server) throws IOException {
- this.mfs = new MasterFileSystem(server, this, null);
+ this.mfs = new MasterFileSystem(server, this, null, false);
this.asm = Mockito.mock(AssignmentManager.class);
}
Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java?rev=1346461&r1=1346460&r2=1346461&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java Tue Jun 5 16:40:47 2012
@@ -19,13 +19,31 @@
*/
package org.apache.hadoop.hbase.master;
+import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HColumnDescriptor;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MediumTests;
import org.apache.hadoop.hbase.MiniHBaseCluster;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.client.HBaseAdmin;
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.ResultScanner;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.zookeeper.ZKAssign;
+import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.zookeeper.KeeperException;
import org.junit.After;
import org.junit.Before;
@@ -46,6 +64,8 @@ public class TestMasterZKSessionRecovery
static {
Configuration conf = TEST_UTIL.getConfiguration();
conf.setLong("hbase.master.zksession.recover.timeout", 50000);
+ conf.setClass(HConstants.HBASE_MASTER_LOADBALANCER_CLASS,
+ MockLoadBalancer.class, LoadBalancer.class);
}
@Before
@@ -92,5 +112,93 @@ public class TestMasterZKSessionRecovery
new KeeperException.SessionExpiredException());
assertFalse(m.isStopped());
}
+
+ /**
+ * Tests that the master does not call retainAssignment after recovery from
+ * expired zookeeper session. Without the HBASE-6046 fix master always tries
+ * to assign all the user regions by calling retainAssignment.
+ */
+ @Test
+ public void testRegionAssignmentAfterMasterRecoveryDueToZKExpiry() throws Exception {
+ MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+ cluster.startRegionServer();
+ HMaster m = cluster.getMaster();
+ // now the cluster is up. So assign some regions.
+ HBaseAdmin admin = new HBaseAdmin(TEST_UTIL.getConfiguration());
+ byte[][] SPLIT_KEYS = new byte[][] { Bytes.toBytes("a"), Bytes.toBytes("b"),
+ Bytes.toBytes("c"), Bytes.toBytes("d"), Bytes.toBytes("e"), Bytes.toBytes("f"),
+ Bytes.toBytes("g"), Bytes.toBytes("h"), Bytes.toBytes("i"), Bytes.toBytes("j") };
+
+ String tableName = "testRegionAssignmentAfterMasterRecoveryDueToZKExpiry";
+ admin.createTable(new HTableDescriptor(tableName), SPLIT_KEYS);
+ ZooKeeperWatcher zooKeeperWatcher = HBaseTestingUtility.getZooKeeperWatcher(TEST_UTIL);
+ ZKAssign.blockUntilNoRIT(zooKeeperWatcher);
+ m.getZooKeeperWatcher().close();
+ MockLoadBalancer.retainAssignCalled = false;
+ m.abort("Test recovery from zk session expired", new KeeperException.SessionExpiredException());
+ assertFalse(m.isStopped());
+ // The recovered master should not call retainAssignment, as it is not a
+ // clean startup.
+ assertFalse("Retain assignment should not be called", MockLoadBalancer.retainAssignCalled);
+ }
+
+ static class MockLoadBalancer extends DefaultLoadBalancer {
+ static boolean retainAssignCalled = false;
+
+ @Override
+ public Map<ServerName, List<HRegionInfo>> retainAssignment(
+ Map<HRegionInfo, ServerName> regions, List<ServerName> servers) {
+ retainAssignCalled = true;
+ return super.retainAssignment(regions, servers);
+ }
+ }
+
+ /**
+ * Tests whether the logs are split when master recovers from an expired
+ * zookeeper session and an RS goes down.
+ */
+ @Test(timeout = 60000)
+ public void testLogSplittingAfterMasterRecoveryDueToZKExpiry() throws IOException,
+ KeeperException, InterruptedException {
+ MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+ cluster.startRegionServer();
+ HMaster m = cluster.getMaster();
+ // now the cluster is up. So assign some regions.
+ HBaseAdmin admin = new HBaseAdmin(TEST_UTIL.getConfiguration());
+ byte[][] SPLIT_KEYS = new byte[][] { Bytes.toBytes("1"), Bytes.toBytes("2"),
+ Bytes.toBytes("3"), Bytes.toBytes("4"), Bytes.toBytes("5") };
+
+ String tableName = "testLogSplittingAfterMasterRecoveryDueToZKExpiry";
+ HTableDescriptor htd = new HTableDescriptor(tableName);
+ HColumnDescriptor hcd = new HColumnDescriptor("col");
+ htd.addFamily(hcd);
+ admin.createTable(htd, SPLIT_KEYS);
+ ZooKeeperWatcher zooKeeperWatcher = HBaseTestingUtility.getZooKeeperWatcher(TEST_UTIL);
+ ZKAssign.blockUntilNoRIT(zooKeeperWatcher);
+ HTable table = new HTable(TEST_UTIL.getConfiguration(), tableName);
+
+ Put p = null;
+ int numberOfPuts = 0;
+ for (numberOfPuts = 0; numberOfPuts < 6; numberOfPuts++) {
+ p = new Put(Bytes.toBytes(numberOfPuts));
+ p.add(Bytes.toBytes("col"), Bytes.toBytes("ql"), Bytes.toBytes("value" + numberOfPuts));
+ table.put(p);
+ }
+ m.getZooKeeperWatcher().close();
+ m.abort("Test recovery from zk session expired", new KeeperException.SessionExpiredException());
+ assertFalse(m.isStopped());
+ cluster.getRegionServer(0).abort("Aborting");
+ // Without the patch for HBASE-6046 this test case will always time out;
+ // with the patch the test case should pass.
+ Scan scan = new Scan();
+ int numberOfRows = 0;
+ ResultScanner scanner = table.getScanner(scan);
+ Result[] result = scanner.next(1);
+ while (result != null && result.length > 0) {
+ numberOfRows++;
+ result = scanner.next(1);
+ }
+ assertEquals("Number of rows should be equal to number of puts.", numberOfPuts, numberOfRows);
+ }
}