You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2012/08/23 07:56:30 UTC

svn commit: r1376374 - in /hbase/branches/0.94/src: main/java/org/apache/hadoop/hbase/master/ main/java/org/apache/hadoop/hbase/zookeeper/ test/java/org/apache/hadoop/hbase/ test/java/org/apache/hadoop/hbase/master/ test/java/org/apache/hadoop/hbase/re...

Author: stack
Date: Thu Aug 23 05:56:29 2012
New Revision: 1376374

URL: http://svn.apache.org/viewvc?rev=1376374&view=rev
Log:
HBASE-5549 Master can fail if ZooKeeper session expires

Modified:
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
    hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
    hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java
    hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java
    hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java
    hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
    hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java Thu Aug 23 05:56:29 2012
@@ -122,97 +122,94 @@ class ActiveMasterManager extends ZooKee
    *
    * This also makes sure that we are watching the master znode so will be
    * notified if another master dies.
-   * @param startupStatus 
+   * @param startupStatus
    * @return True if no issue becoming active master else false if another
    * master was running or if some other problem (zookeeper, stop flag has been
    * set on this Master)
    */
   boolean blockUntilBecomingActiveMaster(MonitoredTask startupStatus,
-      ClusterStatusTracker clusterStatusTracker) {
-    startupStatus.setStatus("Trying to register in ZK as active master");
-    boolean cleanSetOfActiveMaster = true;
-    // Try to become the active master, watch if there is another master.
-    // Write out our ServerName as versioned bytes.
-    try {
-      String backupZNode = ZKUtil.joinZNode(
+    ClusterStatusTracker clusterStatusTracker) {
+    while (true) {
+      startupStatus.setStatus("Trying to register in ZK as active master");
+      // Try to become the active master, watch if there is another master.
+      // Write out our ServerName as versioned bytes.
+      try {
+        String backupZNode = ZKUtil.joinZNode(
           this.watcher.backupMasterAddressesZNode, this.sn.toString());
-      if (ZKUtil.createEphemeralNodeAndWatch(this.watcher,
+        if (ZKUtil.createEphemeralNodeAndWatch(this.watcher,
           this.watcher.masterAddressZNode, this.sn.getVersionedBytes())) {
-        // If we were a backup master before, delete our ZNode from the backup
-        // master directory since we are the active now
-        LOG.info("Deleting ZNode for " + backupZNode +
-                 " from backup master directory");
-        ZKUtil.deleteNodeFailSilent(this.watcher, backupZNode);
+          // If we were a backup master before, delete our ZNode from the backup
+          // master directory since we are the active now
+          LOG.info("Deleting ZNode for " + backupZNode +
+            " from backup master directory");
+          ZKUtil.deleteNodeFailSilent(this.watcher, backupZNode);
+
+          // We are the master, return
+          startupStatus.setStatus("Successfully registered as active master.");
+          this.clusterHasActiveMaster.set(true);
+          LOG.info("Master=" + this.sn);
+          return true;
+        }
 
-        // We are the master, return
-        startupStatus.setStatus("Successfully registered as active master.");
+        // There is another active master running elsewhere or this is a restart
+        // and the master ephemeral node has not expired yet.
         this.clusterHasActiveMaster.set(true);
-        LOG.info("Master=" + this.sn);
-        return cleanSetOfActiveMaster;
-      }
-      cleanSetOfActiveMaster = false;
 
-      // There is another active master running elsewhere or this is a restart
-      // and the master ephemeral node has not expired yet.
-      this.clusterHasActiveMaster.set(true);
-
-      /*
-       * Add a ZNode for ourselves in the backup master directory since we are
-       * not the active master.
-       *
-       * If we become the active master later, ActiveMasterManager will delete
-       * this node explicitly.  If we crash before then, ZooKeeper will delete
-       * this node for us since it is ephemeral.
-       */
-      LOG.info("Adding ZNode for " + backupZNode +
-               " in backup master directory");
-      ZKUtil.createEphemeralNodeAndWatch(this.watcher, backupZNode,
+        /*
+         * Add a ZNode for ourselves in the backup master directory since we are
+         * not the active master.
+         *
+         * If we become the active master later, ActiveMasterManager will delete
+         * this node explicitly.  If we crash before then, ZooKeeper will delete
+         * this node for us since it is ephemeral.
+         */
+        LOG.info("Adding ZNode for " + backupZNode +
+          " in backup master directory");
+        ZKUtil.createEphemeralNodeAndWatch(this.watcher, backupZNode,
           this.sn.getVersionedBytes());
 
-      String msg;
-      byte [] bytes =
-        ZKUtil.getDataAndWatch(this.watcher, this.watcher.masterAddressZNode);
-      if (bytes == null) {
-        msg = ("A master was detected, but went down before its address " +
-          "could be read.  Attempting to become the next active master");
-      } else {
-        ServerName currentMaster = ServerName.parseVersionedServerName(bytes);
-        if (ServerName.isSameHostnameAndPort(currentMaster, this.sn)) {
-          msg = ("Current master has this master's address, " +
-            currentMaster + "; master was restarted?  Waiting on znode " +
-            "to expire...");
-          // Hurry along the expiration of the znode.
-          ZKUtil.deleteNode(this.watcher, this.watcher.masterAddressZNode);
+        String msg;
+        byte [] bytes =
+          ZKUtil.getDataAndWatch(this.watcher, this.watcher.masterAddressZNode);
+        if (bytes == null) {
+          msg = ("A master was detected, but went down before its address " +
+            "could be read.  Attempting to become the next active master");
         } else {
-          msg = "Another master is the active master, " + currentMaster +
-          "; waiting to become the next active master";
+          ServerName currentMaster = ServerName.parseVersionedServerName(bytes);
+          if (ServerName.isSameHostnameAndPort(currentMaster, this.sn)) {
+            msg = ("Current master has this master's address, " +
+              currentMaster + "; master was restarted? Deleting node.");
+            // Hurry along the expiration of the znode.
+            ZKUtil.deleteNode(this.watcher, this.watcher.masterAddressZNode);
+          } else {
+            msg = "Another master is the active master, " + currentMaster +
+              "; waiting to become the next active master";
+          }
         }
-      }
-      LOG.info(msg);
-      startupStatus.setStatus(msg);
-    } catch (KeeperException ke) {
-      master.abort("Received an unexpected KeeperException, aborting", ke);
-      return false;
-    }
-    synchronized (this.clusterHasActiveMaster) {
-      while (this.clusterHasActiveMaster.get() && !this.master.isStopped()) {
-        try {
-          this.clusterHasActiveMaster.wait();
-        } catch (InterruptedException e) {
-          // We expect to be interrupted when a master dies, will fall out if so
-          LOG.debug("Interrupted waiting for master to die", e);
+        LOG.info(msg);
+        startupStatus.setStatus(msg);
+      } catch (KeeperException ke) {
+        master.abort("Received an unexpected KeeperException, aborting", ke);
+        return false;
+      }
+      synchronized (this.clusterHasActiveMaster) {
+        while (this.clusterHasActiveMaster.get() && !this.master.isStopped()) {
+          try {
+            this.clusterHasActiveMaster.wait();
+          } catch (InterruptedException e) {
+            // We expect to be interrupted when a master dies, will fall out if so
+            LOG.debug("Interrupted waiting for master to die", e);
+          }
         }
+        if (!clusterStatusTracker.isClusterUp()) {
+          this.master.stop("Cluster went down before this master became active");
+        }
+        if (this.master.isStopped()) {
+          return false;
+        }
+        // Try to become active master again now that there is no active master
       }
-      if (!clusterStatusTracker.isClusterUp()) {
-        this.master.stop("Cluster went down before this master became active");
-      }
-      if (this.master.isStopped()) {
-        return cleanSetOfActiveMaster;
-      }
-      // Try to become active master again now that there is no active master
-      cleanSetOfActiveMaster = blockUntilBecomingActiveMaster(startupStatus,clusterStatusTracker);
     }
-    return cleanSetOfActiveMaster;
   }
 
   /**

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Thu Aug 23 05:56:29 2012
@@ -1464,8 +1464,7 @@ Server {
   private boolean tryRecoveringExpiredZKSession() throws InterruptedException,
       IOException, KeeperException, ExecutionException {
 
-    this.zooKeeper = new ZooKeeperWatcher(conf, MASTER + ":"
-      + this.serverName.getPort(), this, true);
+    this.zooKeeper.reconnectAfterExpiration();
 
     Callable<Boolean> callable = new Callable<Boolean> () {
       public Boolean call() throws InterruptedException,

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java Thu Aug 23 05:56:29 2012
@@ -69,7 +69,9 @@ public class RecoverableZooKeeper {
   // An identifier of this process in the cluster
   private final String identifier;
   private final byte[] id;
-  private int retryIntervalMillis;
+  private Watcher watcher;
+  private int sessionTimeout;
+  private String quorumServers;
 
   // The metadata attached to each piece of data has the
   // format:
@@ -84,18 +86,31 @@ public class RecoverableZooKeeper {
   private static final int ID_LENGTH_OFFSET = MAGIC_SIZE;
   private static final int ID_LENGTH_SIZE =  Bytes.SIZEOF_INT;
 
-  public RecoverableZooKeeper(String quorumServers, int seesionTimeout,
+  public RecoverableZooKeeper(String quorumServers, int sessionTimeout,
       Watcher watcher, int maxRetries, int retryIntervalMillis) 
   throws IOException {
-    this.zk = new ZooKeeper(quorumServers, seesionTimeout, watcher);
+    this.zk = new ZooKeeper(quorumServers, sessionTimeout, watcher);
     this.retryCounterFactory =
       new RetryCounterFactory(maxRetries, retryIntervalMillis);
-    this.retryIntervalMillis = retryIntervalMillis;
 
     // the identifier = processID@hostName
     this.identifier = ManagementFactory.getRuntimeMXBean().getName();
     LOG.info("The identifier of this process is " + identifier);
     this.id = Bytes.toBytes(identifier);
+    this.watcher = watcher;
+    this.sessionTimeout = sessionTimeout;
+    this.quorumServers = quorumServers;
+  }
+
+  public void reconnectAfterExpiration()
+        throws IOException, InterruptedException {
+    LOG.info("Closing dead ZooKeeper connection, session" +
+      " was: 0x"+Long.toHexString(zk.getSessionId()));
+    zk.close();
+    this.zk = new ZooKeeper(this.quorumServers,
+      this.sessionTimeout, this.watcher);
+    LOG.info("Recreated a ZooKeeper, session" +
+      " is: 0x"+Long.toHexString(zk.getSessionId()));
   }
 
   /**
@@ -124,6 +139,7 @@ public class RecoverableZooKeeper {
             throw e;
 
           case CONNECTIONLOSS:
+          case SESSIONEXPIRED:
           case OPERATIONTIMEOUT:
             retryOrThrow(retryCounter, e, "delete");
             break;
@@ -151,6 +167,7 @@ public class RecoverableZooKeeper {
       } catch (KeeperException e) {
         switch (e.code()) {
           case CONNECTIONLOSS:
+          case SESSIONEXPIRED:
           case OPERATIONTIMEOUT:
             retryOrThrow(retryCounter, e, "exists");
             break;
@@ -177,6 +194,7 @@ public class RecoverableZooKeeper {
       } catch (KeeperException e) {
         switch (e.code()) {
           case CONNECTIONLOSS:
+          case SESSIONEXPIRED:
           case OPERATIONTIMEOUT:
             retryOrThrow(retryCounter, e, "exists");
             break;
@@ -213,6 +231,7 @@ public class RecoverableZooKeeper {
       } catch (KeeperException e) {
         switch (e.code()) {
           case CONNECTIONLOSS:
+          case SESSIONEXPIRED:
           case OPERATIONTIMEOUT:
             retryOrThrow(retryCounter, e, "getChildren");
             break;
@@ -239,6 +258,7 @@ public class RecoverableZooKeeper {
       } catch (KeeperException e) {
         switch (e.code()) {
           case CONNECTIONLOSS:
+          case SESSIONEXPIRED:
           case OPERATIONTIMEOUT:
             retryOrThrow(retryCounter, e, "getChildren");
             break;
@@ -266,6 +286,7 @@ public class RecoverableZooKeeper {
       } catch (KeeperException e) {
         switch (e.code()) {
           case CONNECTIONLOSS:
+          case SESSIONEXPIRED:
           case OPERATIONTIMEOUT:
             retryOrThrow(retryCounter, e, "getData");
             break;
@@ -293,6 +314,7 @@ public class RecoverableZooKeeper {
       } catch (KeeperException e) {
         switch (e.code()) {
           case CONNECTIONLOSS:
+          case SESSIONEXPIRED:
           case OPERATIONTIMEOUT:
             retryOrThrow(retryCounter, e, "getData");
             break;
@@ -322,6 +344,7 @@ public class RecoverableZooKeeper {
       } catch (KeeperException e) {
         switch (e.code()) {
           case CONNECTIONLOSS:
+          case SESSIONEXPIRED:
           case OPERATIONTIMEOUT:
             retryOrThrow(retryCounter, e, "setData");
             break;
@@ -418,6 +441,7 @@ public class RecoverableZooKeeper {
             throw e;
 
           case CONNECTIONLOSS:
+          case SESSIONEXPIRED:
           case OPERATIONTIMEOUT:
             retryOrThrow(retryCounter, e, "create");
             break;
@@ -452,6 +476,7 @@ public class RecoverableZooKeeper {
       } catch (KeeperException e) {
         switch (e.code()) {
           case CONNECTIONLOSS:
+          case SESSIONEXPIRED:
           case OPERATIONTIMEOUT:
             retryOrThrow(retryCounter, e, "create");
             break;

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java Thu Aug 23 05:56:29 2012
@@ -243,6 +243,10 @@ public class ZooKeeperWatcher implements
     return recoverableZooKeeper;
   }
 
+  public void reconnectAfterExpiration() throws IOException, InterruptedException {
+    recoverableZooKeeper.reconnectAfterExpiration();
+  }
+
   /**
    * Get the quorum address of this instance.
    * @return quorum string of this zookeeper connection instance

Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java Thu Aug 23 05:56:29 2012
@@ -39,6 +39,7 @@ import java.util.Map;
 import java.util.NavigableSet;
 import java.util.Random;
 import java.util.UUID;
+import java.util.concurrent.atomic.AtomicBoolean;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -71,6 +72,7 @@ import org.apache.hadoop.hbase.regionser
 import org.apache.hadoop.hbase.security.User;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.FSUtils;
+import org.apache.hadoop.hbase.util.JVMClusterUtil;
 import org.apache.hadoop.hbase.util.RegionSplitter;
 import org.apache.hadoop.hbase.util.Threads;
 import org.apache.hadoop.hbase.util.Writables;
@@ -84,6 +86,7 @@ import org.apache.hadoop.hdfs.MiniDFSClu
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.MiniMRCluster;
 import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.KeeperException.NodeExistsException;
 import org.apache.zookeeper.ZooKeeper;
 
@@ -1342,7 +1345,7 @@ public class HBaseTestingUtility {
    */
   public void expireMasterSession() throws Exception {
     HMaster master = hbaseCluster.getMaster();
-    expireSession(master.getZooKeeper(), master);
+    expireSession(master.getZooKeeper(), false);
   }
 
   /**
@@ -1352,16 +1355,22 @@ public class HBaseTestingUtility {
    */
   public void expireRegionServerSession(int index) throws Exception {
     HRegionServer rs = hbaseCluster.getRegionServer(index);
-    expireSession(rs.getZooKeeper(), rs);
+    expireSession(rs.getZooKeeper(), false);
   }
 
-  public void expireSession(ZooKeeperWatcher nodeZK, Server server)
+   /**
+    * Expire a ZooKeeper session as recommended in ZooKeeper documentation
+    * http://wiki.apache.org/hadoop/ZooKeeper/FAQ#A4
+    * There are issues when doing this:
+    * [1] http://www.mail-archive.com/dev@zookeeper.apache.org/msg01942.html
+    * [2] https://issues.apache.org/jira/browse/ZOOKEEPER-1105
+    *
+    * @param nodeZK - the ZK watcher whose session should be expired
+    * @param checkStatus - true to check if we can create an HTable with the
+    *                    current configuration.
+    */
+  public void expireSession(ZooKeeperWatcher nodeZK, boolean checkStatus)
     throws Exception {
-    expireSession(nodeZK, server, false);
-  }
-
-  public void expireSession(ZooKeeperWatcher nodeZK, Server server,
-      boolean checkStatus) throws Exception {
     Configuration c = new Configuration(this.conf);
     String quorumServers = ZKConfig.getZKQuorumServersString(c);
     int sessionTimeout = 500;
@@ -1369,14 +1378,29 @@ public class HBaseTestingUtility {
     byte[] password = zk.getSessionPasswd();
     long sessionID = zk.getSessionId();
 
+    // Expiry seems to be asynchronous (see comment from P. Hunt in [1]),
+    //  so we create a first watcher to be sure that the
+    //  event was sent. We expect that if our watcher receives the event
+    //  other watchers on the same machine will get it as well.
+    // When we ask to close the connection, ZK does not close it before
+    //  we receive all the events, so we don't have to capture the event, just
+    //  closing the connection should be enough.
+    ZooKeeper monitor = new ZooKeeper(quorumServers,
+      1000, new org.apache.zookeeper.Watcher(){
+      @Override
+      public void process(WatchedEvent watchedEvent) {
+        LOG.info("Monitor ZKW received event="+watchedEvent);
+      }
+    } , sessionID, password);
+
+    // Making it expire
     ZooKeeper newZK = new ZooKeeper(quorumServers,
         sessionTimeout, EmptyWatcher.instance, sessionID, password);
     newZK.close();
-    final long sleep = 7000; // 7s seems enough to manage the timeout
-    LOG.info("ZK Closed Session 0x" + Long.toHexString(sessionID) +
-      "; sleeping=" + sleep);
+    LOG.info("ZK Closed Session 0x" + Long.toHexString(sessionID));
 
-    Thread.sleep(sleep);
+     // Now closing & waiting to be sure that the clients get it.
+     monitor.close();
 
     if (checkStatus) {
       new HTable(new Configuration(conf), HConstants.META_TABLE_NAME).close();
@@ -1545,7 +1569,7 @@ public class HBaseTestingUtility {
    * Make sure that at least the specified number of region servers
    * are running
    * @param num minimum number of region servers that should be running
-   * @return True if we started some servers
+   * @return true if we started some servers
    * @throws IOException
    */
   public boolean ensureSomeRegionServersAvailable(final int num)
@@ -1561,6 +1585,31 @@ public class HBaseTestingUtility {
   }
 
 
+  /**
+   * Make sure that at least the specified number of region servers
+   * are running. We don't count the ones that are currently stopping or are
+   * stopped.
+   * @param num minimum number of region servers that should be running
+   * @return true if we started some servers
+   * @throws IOException
+   */
+  public boolean ensureSomeNonStoppedRegionServersAvailable(final int num)
+    throws IOException {
+    boolean startedServer = ensureSomeRegionServersAvailable(num);
+
+    for (JVMClusterUtil.RegionServerThread rst :
+      hbaseCluster.getRegionServerThreads()) {
+
+      HRegionServer hrs = rst.getRegionServer();
+      if (hrs.isStopping() || hrs.isStopped()) {
+        LOG.info("A region server is stopped or stopping:"+hrs);
+        LOG.info("Started new server=" + hbaseCluster.startRegionServer());
+        startedServer = true;
+      }
+    }
+
+    return startedServer;
+  }
 
 
   /**

Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java Thu Aug 23 05:56:29 2012
@@ -26,6 +26,8 @@ import static org.junit.Assert.assertNul
 import static org.junit.Assert.fail;
 
 import java.io.IOException;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.SynchronousQueue;
 import java.util.concurrent.ThreadPoolExecutor;
@@ -40,6 +42,7 @@ import org.apache.hadoop.hbase.client.HC
 import org.apache.hadoop.hbase.client.HConnectionManager;
 import org.apache.hadoop.hbase.client.HTable;
 import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.master.HMaster;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.Threads;
 import org.apache.hadoop.hbase.zookeeper.ZKConfig;
@@ -97,6 +100,15 @@ public class TestZooKeeper {
     TEST_UTIL.ensureSomeRegionServersAvailable(2);
   }
 
+  private ZooKeeperWatcher getZooKeeperWatcher(HConnection c) throws
+    NoSuchMethodException, InvocationTargetException, IllegalAccessException {
+
+    Method getterZK = c.getClass().getMethod("getKeepAliveZooKeeperWatcher");
+    getterZK.setAccessible(true);
+
+    return (ZooKeeperWatcher) getterZK.invoke(c);
+  }
+
   /**
    * See HBASE-1232 and http://wiki.apache.org/hadoop/ZooKeeper/FAQ#4.
    * @throws IOException
@@ -111,7 +123,7 @@ public class TestZooKeeper {
     new HTable(c, HConstants.META_TABLE_NAME).close();
     HConnection connection = HConnectionManager.getConnection(c);
     ZooKeeperWatcher connectionZK = connection.getZooKeeperWatcher();
-    TEST_UTIL.expireSession(connectionZK, null);
+    TEST_UTIL.expireSession(connectionZK, false);
 
     // provoke session expiration by doing something with ZK
     ZKUtil.dump(connectionZK);
@@ -350,6 +362,21 @@ public class TestZooKeeper {
     ZKUtil.getChildDataAndWatchForNewChildren(zkw, "/wrongNode");
   }
 
+  /**
+   * Master recovery when the znode already exists. Internally, this
+   *  test differs from {@link #testMasterSessionExpired} because here
+   *  the master znode will exist in ZK.
+   */
+  @Test(timeout=20000)
+  public void testMasterZKSessionRecoveryFailure() throws Exception {
+    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+    HMaster m = cluster.getMaster();
+    m.abort("Test recovery from zk session expired",
+      new KeeperException.SessionExpiredException());
+    assertFalse(m.isStopped());
+    testSanity();
+  }
+
   @org.junit.Rule
   public org.apache.hadoop.hbase.ResourceCheckerJUnitRule cu =
     new org.apache.hadoop.hbase.ResourceCheckerJUnitRule();

Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java Thu Aug 23 05:56:29 2012
@@ -87,6 +87,8 @@ public class TestDistributedLogSplitting
     LOG.info("Starting cluster");
     conf = HBaseConfiguration.create();
     conf.getLong("hbase.splitlog.max.resubmit", 0);
+    // Make the failure test faster
+    conf.setInt("zookeeper.recovery.retry", 0);
     TEST_UTIL = new HBaseTestingUtility(conf);
     TEST_UTIL.startMiniCluster(NUM_MASTERS, num_rs);
     cluster = TEST_UTIL.getHBaseCluster();
@@ -245,7 +247,7 @@ public class TestDistributedLogSplitting
     slm.enqueueSplitTask(logfiles[0].getPath().toString(), batch);
     //waitForCounter but for one of the 2 counters
     long curt = System.currentTimeMillis();
-    long waitTime = 30000;
+    long waitTime = 80000;
     long endt = curt + waitTime;
     while (curt < endt) {
       if ((tot_wkr_task_resigned.get() + tot_wkr_task_err.get() + 

Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java Thu Aug 23 05:56:29 2012
@@ -80,40 +80,6 @@ public class TestMasterZKSessionRecovery
   }
 
   /**
-   * Negative test of master recovery from zk session expiry.
-   * <p>
-   * Starts with one master. Fakes the master zk session expired.
-   * The master should be able to come up if he is able to create
-   * the node as active master.
-   * @throws Exception
-   */
-  @Test(timeout=10000)
-  public void testMasterZKSessionRecoveryFailure() throws Exception {
-    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
-    HMaster m = cluster.getMaster();
-    m.abort("Test recovery from zk session expired",
-      new KeeperException.SessionExpiredException());
-    assertFalse(m.isStopped());
-  }
-
-  /**
-   * Positive test of master recovery from zk session expiry.
-   * <p>
-   * Starts with one master. Closes the master zk session.
-   * Ensures the master can recover the expired zk session.
-   * @throws Exception
-   */
-  @Test(timeout=60000)
-  public void testMasterZKSessionRecoverySuccess() throws Exception {
-    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
-    HMaster m = cluster.getMaster();
-    m.getZooKeeperWatcher().close();
-    m.abort("Test recovery from zk session expired",
-      new KeeperException.SessionExpiredException());
-    assertFalse(m.isStopped());
-  }
-  
-  /**
    * Tests that the master does not call retainAssignment after recovery from
    * expired zookeeper session. Without the HBASE-6046 fix master always tries
    * to assign all the user regions by calling retainAssignment.

Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java Thu Aug 23 05:56:29 2012
@@ -20,6 +20,7 @@
 package org.apache.hadoop.hbase.regionserver;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNotSame;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.assertFalse;
@@ -83,7 +84,7 @@ public class TestSplitTransactionOnClust
   }
 
   @Before public void setup() throws IOException {
-    TESTING_UTIL.ensureSomeRegionServersAvailable(NB_SERVERS);
+    TESTING_UTIL.ensureSomeNonStoppedRegionServersAvailable(NB_SERVERS);
     this.admin = new HBaseAdmin(TESTING_UTIL.getConfiguration());
     this.cluster = TESTING_UTIL.getMiniHBaseCluster();
   }
@@ -651,7 +652,10 @@ public class TestSplitTransactionOnClust
     HRegionServer tableRegionServer = cluster.getRegionServer(tableRegionIndex);
     if (metaRegionServer.getServerName().equals(tableRegionServer.getServerName())) {
       HRegionServer hrs = getOtherRegionServer(cluster, metaRegionServer);
-      LOG.info("Moving " + hri.getRegionNameAsString() + " to " +
+      assertNotNull(hrs);
+      assertNotNull(hri);
+      LOG.
+        info("Moving " + hri.getRegionNameAsString() + " to " +
         hrs.getServerName() + "; metaServerIndex=" + metaServerIndex);
       admin.move(hri.getEncodedNameAsBytes(),
         Bytes.toBytes(hrs.getServerName().toString()));

Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java Thu Aug 23 05:56:29 2012
@@ -100,6 +100,8 @@ public class TestReplication {
     conf1.setLong("replication.source.sleepforretries", 100);
     conf1.setInt("hbase.regionserver.maxlogs", 10);
     conf1.setLong("hbase.master.logcleaner.ttl", 10);
+    conf1.setInt("zookeeper.recovery.retry", 1);
+    conf1.setInt("zookeeper.recovery.retry.intervalmill", 10);
     conf1.setBoolean(HConstants.REPLICATION_ENABLE_KEY, true);
     conf1.setBoolean("dfs.support.append", true);
     conf1.setLong(HConstants.THREAD_WAKE_FREQUENCY, 100);
@@ -757,9 +759,11 @@ public class TestReplication {
 
     int lastCount = 0;
 
+    final long start = System.currentTimeMillis();
     for (int i = 0; i < NB_RETRIES; i++) {
       if (i==NB_RETRIES-1) {
-        fail("Waited too much time for queueFailover replication");
+        fail("Waited too much time for queueFailover replication. " +
+          "Waited "+(System.currentTimeMillis() - start)+"ms.");
       }
       Scan scan2 = new Scan();
       ResultScanner scanner2 = htable2.getScanner(scan2);