You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2011/04/27 02:01:31 UTC

svn commit: r1096970 - in /hbase/trunk: CHANGES.txt src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java src/main/java/org/apache/hadoop/hbase/master/HMaster.java src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java

Author: stack
Date: Wed Apr 27 00:01:31 2011
New Revision: 1096970

URL: http://svn.apache.org/viewvc?rev=1096970&view=rev
Log:
HBASE-3210 HBASE-1921 for the new master

Modified:
    hbase/trunk/CHANGES.txt
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/trunk/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java

Modified: hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=1096970&r1=1096969&r2=1096970&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Wed Apr 27 00:01:31 2011
@@ -87,6 +87,7 @@ Release 0.91.0 - Unreleased
                (Prakash Khemani)
    HBASE-3819  TestSplitLogWorker has too many SLWs running -- makes for
                contention and occasional failures
+   HBASE-3210  HBASE-1921 for the new master
 
   IMPROVEMENTS
    HBASE-3290  Max Compaction Size (Nicolas Spiegelberg via Stack)  

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1096970&r1=1096969&r2=1096970&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Wed Apr 27 00:01:31 2011
@@ -222,6 +222,10 @@ public class AssignmentManager extends Z
     // Process list of dead servers
     processDeadServers(deadServers);
     // Check existing regions in transition
+    processRegionsInTransition();
+  }
+
+  public void processRegionsInTransition() throws KeeperException, IOException {
     List<String> nodes = ZKUtil.listChildrenAndWatchForNewChildren(watcher,
         watcher.assignmentZNode);
     if (nodes.isEmpty()) {

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1096970&r1=1096969&r2=1096970&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Wed Apr 27 00:01:31 2011
@@ -276,10 +276,8 @@ implements HMasterInterface, HMasterRegi
        * now wait until it dies to try and become the next active master.  If we
        * do not succeed on our first attempt, this is no longer a cluster startup.
        */
-      this.activeMasterManager = new ActiveMasterManager(zooKeeper, address, this);
-      this.zooKeeper.registerListener(activeMasterManager);
-      stallIfBackupMaster(this.conf, this.activeMasterManager);
-      this.activeMasterManager.blockUntilBecomingActiveMaster();
+      becomeActiveMaster();
+
       // We are either the active master or we were asked to shutdown
       if (!this.stopped) {
         finishInitialization();
@@ -308,6 +306,52 @@ implements HMasterInterface, HMasterRegi
     LOG.info("HMaster main thread exiting");
   }
 
+  /**
+   * Try becoming active master.
+   * @return True if we could successfully become the active master.
+   * @throws InterruptedException
+   */
+  private boolean becomeActiveMaster() throws InterruptedException {
+    this.activeMasterManager = new ActiveMasterManager(zooKeeper, address,
+        this);
+    this.zooKeeper.registerListener(activeMasterManager);
+    stallIfBackupMaster(this.conf, this.activeMasterManager);
+    return this.activeMasterManager.blockUntilBecomingActiveMaster();
+  }
+
+  /**
+   * Initialize all ZK based system trackers.
+   * @throws IOException
+   * @throws InterruptedException
+   */
+  private void initializeZKBasedSystemTrackers() throws IOException,
+      InterruptedException, KeeperException {
+    this.catalogTracker = new CatalogTracker(this.zooKeeper, this.connection,
+        this, conf.getInt("hbase.master.catalog.timeout", Integer.MAX_VALUE));
+    this.catalogTracker.start();
+
+    this.assignmentManager = new AssignmentManager(this, serverManager,
+        this.catalogTracker, this.executorService);
+    this.balancer = new LoadBalancer(conf);
+    zooKeeper.registerListenerFirst(assignmentManager);
+
+    this.regionServerTracker = new RegionServerTracker(zooKeeper, this,
+        this.serverManager);
+    this.regionServerTracker.start();
+
+    // Set the cluster as up.  If new RSs, they'll be waiting on this before
+    // going ahead with their startup.
+    this.clusterStatusTracker = new ClusterStatusTracker(getZooKeeper(), this);
+    this.clusterStatusTracker.start();
+    boolean wasUp = this.clusterStatusTracker.isClusterUp();
+    if (!wasUp) this.clusterStatusTracker.setClusterUp();
+
+    LOG.info("Server active/primary master; " + this.address +
+        ", sessionid=0x" +
+        Long.toHexString(this.zooKeeper.getZooKeeper().getSessionId()) +
+        ", cluster-up flag was=" + wasUp);
+  }
+
   private void loop() {
     // Check if we should stop every second.
     Sleeper sleeper = new Sleeper(1000, this);
@@ -357,30 +401,7 @@ implements HMasterInterface, HMasterRegi
 
     this.serverManager = new ServerManager(this, this, metrics);
 
-    this.catalogTracker = new CatalogTracker(this.zooKeeper, this.connection,
-      this, conf.getInt("hbase.master.catalog.timeout", Integer.MAX_VALUE));
-    this.catalogTracker.start();
-
-    this.assignmentManager = new AssignmentManager(this, serverManager,
-      this.catalogTracker, this.executorService);
-    this.balancer = new LoadBalancer(conf);
-    zooKeeper.registerListenerFirst(assignmentManager);
-
-    this.regionServerTracker = new RegionServerTracker(zooKeeper, this,
-      this.serverManager);
-    this.regionServerTracker.start();
-
-    // Set the cluster as up.  If new RSs, they'll be waiting on this before
-    // going ahead with their startup.
-    this.clusterStatusTracker = new ClusterStatusTracker(getZooKeeper(), this);
-    this.clusterStatusTracker.start();
-    boolean wasUp = this.clusterStatusTracker.isClusterUp();
-    if (!wasUp) this.clusterStatusTracker.setClusterUp();
-
-    LOG.info("Server active/primary master; " + this.address +
-      ", sessionid=0x" +
-      Long.toHexString(this.zooKeeper.getZooKeeper().getSessionId()) +
-      ", cluster-up flag was=" + wasUp);
+    initializeZKBasedSystemTrackers();
 
     // initialize master side coprocessors before we start handling requests
     this.cpHost = new MasterCoprocessorHost(this, this.conf);
@@ -1089,10 +1110,67 @@ implements HMasterInterface, HMasterRegi
 
   @Override
   public void abort(final String msg, final Throwable t) {
-    if (t != null) LOG.fatal(msg, t);
-    else LOG.fatal(msg);
-    this.abort = true;
-    stop("Aborting");
+    if (abortNow(msg, t)) {
+      if (t != null) LOG.fatal(msg, t);
+      else LOG.fatal(msg);
+      this.abort = true;
+      stop("Aborting");
+    }
+  }
+
+  /**
+   * We do the following.
+   * 1. Create a new ZK session. (since our current one is expired)
+   * 2. Try to become a primary master again
+   * 3. Initialize all ZK based system trackers.
+   * 4. Assign root and meta. (they are already assigned, but we need to update our
+   * internal memory state to reflect it)
+   * 5. Process any RIT if any during the process of our recovery.
+   *
+   * @return True if we could successfully recover from ZK session expiry.
+   * @throws InterruptedException
+   * @throws IOException
+   */
+  private boolean tryRecoveringExpiredZKSession() throws InterruptedException,
+      IOException, KeeperException {
+    this.zooKeeper = new ZooKeeperWatcher(conf, MASTER + ":"
+        + address.getPort(), this);
+
+    if (!becomeActiveMaster()) {
+      return false;
+    }
+    initializeZKBasedSystemTrackers();
+    // Update in-memory structures to reflect our earlier Root/Meta assignment.
+    assignRootAndMeta();
+    // process RIT if any
+    this.assignmentManager.processRegionsInTransition();
+    return true;
+  }
+
+  /**
+   * Check to see if the current trigger for abort is due to ZooKeeper session
+   * expiry, and if yes, whether we can recover from ZK session expiry.
+   *
+   * @param msg Original abort message
+   * @param t   The cause for current abort request
+   * @return true if we should proceed with abort operation, false otherwise.
+   */
+  private boolean abortNow(final String msg, final Throwable t) {
+    if (!this.isActiveMaster) {
+      return true;
+    }
+    if (t != null && t instanceof KeeperException.SessionExpiredException) {
+      try {
+        LOG.info("Primary Master trying to recover from ZooKeeper session " +
+            "expiry.");
+        return !tryRecoveringExpiredZKSession();
+      } catch (Throwable newT) {
+        LOG.error("Primary master encountered unexpected exception while " +
+            "trying to recover from ZooKeeper session" +
+            " expiry. Proceeding with server abort.", newT);
+      }
+    }
+    return true;
   }
 
   @Override

Modified: hbase/trunk/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java?rev=1096970&r1=1096969&r2=1096970&view=diff
==============================================================================
--- hbase/trunk/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java (original)
+++ hbase/trunk/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java Wed Apr 27 00:01:31 2011
@@ -131,8 +131,8 @@ public class TestZooKeeper {
     testSanity();
   }
 
-  //@Test
-  public void disabledTestMasterSessionExpired() throws Exception {
+  @Test
+  public void testMasterSessionExpired() throws Exception {
     LOG.info("Starting testMasterSessionExpired");
     TEST_UTIL.expireMasterSession();
     testSanity();