You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2011/04/27 02:01:31 UTC
svn commit: r1096970 - in /hbase/trunk: CHANGES.txt
src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
src/main/java/org/apache/hadoop/hbase/master/HMaster.java
src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java
Author: stack
Date: Wed Apr 27 00:01:31 2011
New Revision: 1096970
URL: http://svn.apache.org/viewvc?rev=1096970&view=rev
Log:
HBASE-3210 HBASE-1921 for the new master
Modified:
hbase/trunk/CHANGES.txt
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
hbase/trunk/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java
Modified: hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=1096970&r1=1096969&r2=1096970&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Wed Apr 27 00:01:31 2011
@@ -87,6 +87,7 @@ Release 0.91.0 - Unreleased
(Prakash Khemani)
HBASE-3819 TestSplitLogWorker has too many SLWs running -- makes for
contention and occasional failures
+ HBASE-3210 HBASE-1921 for the new master
IMPROVEMENTS
HBASE-3290 Max Compaction Size (Nicolas Spiegelberg via Stack)
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1096970&r1=1096969&r2=1096970&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Wed Apr 27 00:01:31 2011
@@ -222,6 +222,10 @@ public class AssignmentManager extends Z
// Process list of dead servers
processDeadServers(deadServers);
// Check existing regions in transition
+ processRegionsInTransition();
+ }
+
+ public void processRegionsInTransition() throws KeeperException, IOException {
List<String> nodes = ZKUtil.listChildrenAndWatchForNewChildren(watcher,
watcher.assignmentZNode);
if (nodes.isEmpty()) {
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1096970&r1=1096969&r2=1096970&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Wed Apr 27 00:01:31 2011
@@ -276,10 +276,8 @@ implements HMasterInterface, HMasterRegi
* now wait until it dies to try and become the next active master. If we
* do not succeed on our first attempt, this is no longer a cluster startup.
*/
- this.activeMasterManager = new ActiveMasterManager(zooKeeper, address, this);
- this.zooKeeper.registerListener(activeMasterManager);
- stallIfBackupMaster(this.conf, this.activeMasterManager);
- this.activeMasterManager.blockUntilBecomingActiveMaster();
+ becomeActiveMaster();
+
// We are either the active master or we were asked to shutdown
if (!this.stopped) {
finishInitialization();
@@ -308,6 +306,52 @@ implements HMasterInterface, HMasterRegi
LOG.info("HMaster main thread exiting");
}
+ /**
+ * Try becoming active master.
+ * @return True if we could successfully become the active master.
+ * @throws InterruptedException
+ */
+ private boolean becomeActiveMaster() throws InterruptedException {
+ this.activeMasterManager = new ActiveMasterManager(zooKeeper, address,
+ this);
+ this.zooKeeper.registerListener(activeMasterManager);
+ stallIfBackupMaster(this.conf, this.activeMasterManager);
+ return this.activeMasterManager.blockUntilBecomingActiveMaster();
+ }
+
+ /**
+ * Initialize all ZK based system trackers.
+ * @throws IOException
+ * @throws InterruptedException
+ */
+ private void initializeZKBasedSystemTrackers() throws IOException,
+ InterruptedException, KeeperException {
+ this.catalogTracker = new CatalogTracker(this.zooKeeper, this.connection,
+ this, conf.getInt("hbase.master.catalog.timeout", Integer.MAX_VALUE));
+ this.catalogTracker.start();
+
+ this.assignmentManager = new AssignmentManager(this, serverManager,
+ this.catalogTracker, this.executorService);
+ this.balancer = new LoadBalancer(conf);
+ zooKeeper.registerListenerFirst(assignmentManager);
+
+ this.regionServerTracker = new RegionServerTracker(zooKeeper, this,
+ this.serverManager);
+ this.regionServerTracker.start();
+
+ // Set the cluster as up. If new RSs, they'll be waiting on this before
+ // going ahead with their startup.
+ this.clusterStatusTracker = new ClusterStatusTracker(getZooKeeper(), this);
+ this.clusterStatusTracker.start();
+ boolean wasUp = this.clusterStatusTracker.isClusterUp();
+ if (!wasUp) this.clusterStatusTracker.setClusterUp();
+
+ LOG.info("Server active/primary master; " + this.address +
+ ", sessionid=0x" +
+ Long.toHexString(this.zooKeeper.getZooKeeper().getSessionId()) +
+ ", cluster-up flag was=" + wasUp);
+ }
+
private void loop() {
// Check if we should stop every second.
Sleeper sleeper = new Sleeper(1000, this);
@@ -357,30 +401,7 @@ implements HMasterInterface, HMasterRegi
this.serverManager = new ServerManager(this, this, metrics);
- this.catalogTracker = new CatalogTracker(this.zooKeeper, this.connection,
- this, conf.getInt("hbase.master.catalog.timeout", Integer.MAX_VALUE));
- this.catalogTracker.start();
-
- this.assignmentManager = new AssignmentManager(this, serverManager,
- this.catalogTracker, this.executorService);
- this.balancer = new LoadBalancer(conf);
- zooKeeper.registerListenerFirst(assignmentManager);
-
- this.regionServerTracker = new RegionServerTracker(zooKeeper, this,
- this.serverManager);
- this.regionServerTracker.start();
-
- // Set the cluster as up. If new RSs, they'll be waiting on this before
- // going ahead with their startup.
- this.clusterStatusTracker = new ClusterStatusTracker(getZooKeeper(), this);
- this.clusterStatusTracker.start();
- boolean wasUp = this.clusterStatusTracker.isClusterUp();
- if (!wasUp) this.clusterStatusTracker.setClusterUp();
-
- LOG.info("Server active/primary master; " + this.address +
- ", sessionid=0x" +
- Long.toHexString(this.zooKeeper.getZooKeeper().getSessionId()) +
- ", cluster-up flag was=" + wasUp);
+ initializeZKBasedSystemTrackers();
// initialize master side coprocessors before we start handling requests
this.cpHost = new MasterCoprocessorHost(this, this.conf);
@@ -1089,10 +1110,67 @@ implements HMasterInterface, HMasterRegi
@Override
public void abort(final String msg, final Throwable t) {
- if (t != null) LOG.fatal(msg, t);
- else LOG.fatal(msg);
- this.abort = true;
- stop("Aborting");
+ if (abortNow(msg, t)) {
+ if (t != null) LOG.fatal(msg, t);
+ else LOG.fatal(msg);
+ this.abort = true;
+ stop("Aborting");
+ }
+ }
+
+ /**
+ * We do the following.
+ * 1. Create a new ZK session. (since our current one is expired)
+ * 2. Try to become a primary master again
+ * 3. Initialize all ZK based system trackers.
+ * 4. Assign root and meta. (they are already assigned, but we need to update our
+ * internal memory state to reflect it)
+ * 5. Process any regions in transition (RIT) that arose during our recovery.
+ *
+ * @return True if we could successfully recover from ZK session expiry.
+ * @throws InterruptedException
+ * @throws IOException
+ */
+ private boolean tryRecoveringExpiredZKSession() throws InterruptedException,
+ IOException, KeeperException {
+ this.zooKeeper = new ZooKeeperWatcher(conf, MASTER + ":"
+ + address.getPort(), this);
+
+ if (!becomeActiveMaster()) {
+ return false;
+ }
+ initializeZKBasedSystemTrackers();
+ // Update in-memory structures to reflect our earlier Root/Meta assignment.
+ assignRootAndMeta();
+ // process RIT if any
+ this.assignmentManager.processRegionsInTransition();
+ return true;
+ }
+
+ /**
+ * Check to see if the current trigger for abort is due to ZooKeeper session
+ * expiry and, if so, whether we can recover from ZK session expiry.
+ *
+ * @param msg Original abort message
+ * @param t The cause for current abort request
+ * @return true if we should proceed with abort operation, false otherwise.
+ */
+ private boolean abortNow(final String msg, final Throwable t) {
+ if (!this.isActiveMaster) {
+ return true;
+ }
+ if (t != null && t instanceof KeeperException.SessionExpiredException) {
+ try {
+ LOG.info("Primary Master trying to recover from ZooKeeper session " +
+ "expiry.");
+ return !tryRecoveringExpiredZKSession();
+ } catch (Throwable newT) {
+ LOG.error("Primary master encountered unexpected exception while " +
+ "trying to recover from ZooKeeper session" +
+ " expiry. Proceeding with server abort.", newT);
+ }
+ }
+ return true;
}
@Override
Modified: hbase/trunk/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java?rev=1096970&r1=1096969&r2=1096970&view=diff
==============================================================================
--- hbase/trunk/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java (original)
+++ hbase/trunk/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java Wed Apr 27 00:01:31 2011
@@ -131,8 +131,8 @@ public class TestZooKeeper {
testSanity();
}
- //@Test
- public void disabledTestMasterSessionExpired() throws Exception {
+ @Test
+ public void testMasterSessionExpired() throws Exception {
LOG.info("Starting testMasterSessionExpired");
TEST_UTIL.expireMasterSession();
testSanity();