You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2012/08/23 07:56:30 UTC
svn commit: r1376374 - in /hbase/branches/0.94/src:
main/java/org/apache/hadoop/hbase/master/
main/java/org/apache/hadoop/hbase/zookeeper/
test/java/org/apache/hadoop/hbase/ test/java/org/apache/hadoop/hbase/master/
test/java/org/apache/hadoop/hbase/re...
Author: stack
Date: Thu Aug 23 05:56:29 2012
New Revision: 1376374
URL: http://svn.apache.org/viewvc?rev=1376374&view=rev
Log:
HBASE-5549 Master can fail if ZooKeeper session expires
Modified:
hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java
hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java
hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java
hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java
hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java
hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java
Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java Thu Aug 23 05:56:29 2012
@@ -122,97 +122,94 @@ class ActiveMasterManager extends ZooKee
*
* This also makes sure that we are watching the master znode so will be
* notified if another master dies.
- * @param startupStatus
+ * @param startupStatus
* @return True if no issue becoming active master else false if another
* master was running or if some other problem (zookeeper, stop flag has been
* set on this Master)
*/
boolean blockUntilBecomingActiveMaster(MonitoredTask startupStatus,
- ClusterStatusTracker clusterStatusTracker) {
- startupStatus.setStatus("Trying to register in ZK as active master");
- boolean cleanSetOfActiveMaster = true;
- // Try to become the active master, watch if there is another master.
- // Write out our ServerName as versioned bytes.
- try {
- String backupZNode = ZKUtil.joinZNode(
+ ClusterStatusTracker clusterStatusTracker) {
+ while (true) {
+ startupStatus.setStatus("Trying to register in ZK as active master");
+ // Try to become the active master, watch if there is another master.
+ // Write out our ServerName as versioned bytes.
+ try {
+ String backupZNode = ZKUtil.joinZNode(
this.watcher.backupMasterAddressesZNode, this.sn.toString());
- if (ZKUtil.createEphemeralNodeAndWatch(this.watcher,
+ if (ZKUtil.createEphemeralNodeAndWatch(this.watcher,
this.watcher.masterAddressZNode, this.sn.getVersionedBytes())) {
- // If we were a backup master before, delete our ZNode from the backup
- // master directory since we are the active now
- LOG.info("Deleting ZNode for " + backupZNode +
- " from backup master directory");
- ZKUtil.deleteNodeFailSilent(this.watcher, backupZNode);
+ // If we were a backup master before, delete our ZNode from the backup
+ // master directory since we are the active now
+ LOG.info("Deleting ZNode for " + backupZNode +
+ " from backup master directory");
+ ZKUtil.deleteNodeFailSilent(this.watcher, backupZNode);
+
+ // We are the master, return
+ startupStatus.setStatus("Successfully registered as active master.");
+ this.clusterHasActiveMaster.set(true);
+ LOG.info("Master=" + this.sn);
+ return true;
+ }
- // We are the master, return
- startupStatus.setStatus("Successfully registered as active master.");
+ // There is another active master running elsewhere or this is a restart
+ // and the master ephemeral node has not expired yet.
this.clusterHasActiveMaster.set(true);
- LOG.info("Master=" + this.sn);
- return cleanSetOfActiveMaster;
- }
- cleanSetOfActiveMaster = false;
- // There is another active master running elsewhere or this is a restart
- // and the master ephemeral node has not expired yet.
- this.clusterHasActiveMaster.set(true);
-
- /*
- * Add a ZNode for ourselves in the backup master directory since we are
- * not the active master.
- *
- * If we become the active master later, ActiveMasterManager will delete
- * this node explicitly. If we crash before then, ZooKeeper will delete
- * this node for us since it is ephemeral.
- */
- LOG.info("Adding ZNode for " + backupZNode +
- " in backup master directory");
- ZKUtil.createEphemeralNodeAndWatch(this.watcher, backupZNode,
+ /*
+ * Add a ZNode for ourselves in the backup master directory since we are
+ * not the active master.
+ *
+ * If we become the active master later, ActiveMasterManager will delete
+ * this node explicitly. If we crash before then, ZooKeeper will delete
+ * this node for us since it is ephemeral.
+ */
+ LOG.info("Adding ZNode for " + backupZNode +
+ " in backup master directory");
+ ZKUtil.createEphemeralNodeAndWatch(this.watcher, backupZNode,
this.sn.getVersionedBytes());
- String msg;
- byte [] bytes =
- ZKUtil.getDataAndWatch(this.watcher, this.watcher.masterAddressZNode);
- if (bytes == null) {
- msg = ("A master was detected, but went down before its address " +
- "could be read. Attempting to become the next active master");
- } else {
- ServerName currentMaster = ServerName.parseVersionedServerName(bytes);
- if (ServerName.isSameHostnameAndPort(currentMaster, this.sn)) {
- msg = ("Current master has this master's address, " +
- currentMaster + "; master was restarted? Waiting on znode " +
- "to expire...");
- // Hurry along the expiration of the znode.
- ZKUtil.deleteNode(this.watcher, this.watcher.masterAddressZNode);
+ String msg;
+ byte [] bytes =
+ ZKUtil.getDataAndWatch(this.watcher, this.watcher.masterAddressZNode);
+ if (bytes == null) {
+ msg = ("A master was detected, but went down before its address " +
+ "could be read. Attempting to become the next active master");
} else {
- msg = "Another master is the active master, " + currentMaster +
- "; waiting to become the next active master";
+ ServerName currentMaster = ServerName.parseVersionedServerName(bytes);
+ if (ServerName.isSameHostnameAndPort(currentMaster, this.sn)) {
+ msg = ("Current master has this master's address, " +
+ currentMaster + "; master was restarted? Deleting node.");
+ // Hurry along the expiration of the znode.
+ ZKUtil.deleteNode(this.watcher, this.watcher.masterAddressZNode);
+ } else {
+ msg = "Another master is the active master, " + currentMaster +
+ "; waiting to become the next active master";
+ }
}
- }
- LOG.info(msg);
- startupStatus.setStatus(msg);
- } catch (KeeperException ke) {
- master.abort("Received an unexpected KeeperException, aborting", ke);
- return false;
- }
- synchronized (this.clusterHasActiveMaster) {
- while (this.clusterHasActiveMaster.get() && !this.master.isStopped()) {
- try {
- this.clusterHasActiveMaster.wait();
- } catch (InterruptedException e) {
- // We expect to be interrupted when a master dies, will fall out if so
- LOG.debug("Interrupted waiting for master to die", e);
+ LOG.info(msg);
+ startupStatus.setStatus(msg);
+ } catch (KeeperException ke) {
+ master.abort("Received an unexpected KeeperException, aborting", ke);
+ return false;
+ }
+ synchronized (this.clusterHasActiveMaster) {
+ while (this.clusterHasActiveMaster.get() && !this.master.isStopped()) {
+ try {
+ this.clusterHasActiveMaster.wait();
+ } catch (InterruptedException e) {
+ // We expect to be interrupted when a master dies, will fall out if so
+ LOG.debug("Interrupted waiting for master to die", e);
+ }
}
+ if (!clusterStatusTracker.isClusterUp()) {
+ this.master.stop("Cluster went down before this master became active");
+ }
+ if (this.master.isStopped()) {
+ return false;
+ }
+ // Try to become active master again now that there is no active master
}
- if (!clusterStatusTracker.isClusterUp()) {
- this.master.stop("Cluster went down before this master became active");
- }
- if (this.master.isStopped()) {
- return cleanSetOfActiveMaster;
- }
- // Try to become active master again now that there is no active master
- cleanSetOfActiveMaster = blockUntilBecomingActiveMaster(startupStatus,clusterStatusTracker);
}
- return cleanSetOfActiveMaster;
}
/**
Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Thu Aug 23 05:56:29 2012
@@ -1464,8 +1464,7 @@ Server {
private boolean tryRecoveringExpiredZKSession() throws InterruptedException,
IOException, KeeperException, ExecutionException {
- this.zooKeeper = new ZooKeeperWatcher(conf, MASTER + ":"
- + this.serverName.getPort(), this, true);
+ this.zooKeeper.reconnectAfterExpiration();
Callable<Boolean> callable = new Callable<Boolean> () {
public Boolean call() throws InterruptedException,
Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java Thu Aug 23 05:56:29 2012
@@ -69,7 +69,9 @@ public class RecoverableZooKeeper {
// An identifier of this process in the cluster
private final String identifier;
private final byte[] id;
- private int retryIntervalMillis;
+ private Watcher watcher;
+ private int sessionTimeout;
+ private String quorumServers;
// The metadata attached to each piece of data has the
// format:
@@ -84,18 +86,31 @@ public class RecoverableZooKeeper {
private static final int ID_LENGTH_OFFSET = MAGIC_SIZE;
private static final int ID_LENGTH_SIZE = Bytes.SIZEOF_INT;
- public RecoverableZooKeeper(String quorumServers, int seesionTimeout,
+ public RecoverableZooKeeper(String quorumServers, int sessionTimeout,
Watcher watcher, int maxRetries, int retryIntervalMillis)
throws IOException {
- this.zk = new ZooKeeper(quorumServers, seesionTimeout, watcher);
+ this.zk = new ZooKeeper(quorumServers, sessionTimeout, watcher);
this.retryCounterFactory =
new RetryCounterFactory(maxRetries, retryIntervalMillis);
- this.retryIntervalMillis = retryIntervalMillis;
// the identifier = processID@hostName
this.identifier = ManagementFactory.getRuntimeMXBean().getName();
LOG.info("The identifier of this process is " + identifier);
this.id = Bytes.toBytes(identifier);
+ this.watcher = watcher;
+ this.sessionTimeout = sessionTimeout;
+ this.quorumServers = quorumServers;
+ }
+
+ public void reconnectAfterExpiration()
+ throws IOException, InterruptedException {
+ LOG.info("Closing dead ZooKeeper connection, session" +
+ " was: 0x"+Long.toHexString(zk.getSessionId()));
+ zk.close();
+ this.zk = new ZooKeeper(this.quorumServers,
+ this.sessionTimeout, this.watcher);
+ LOG.info("Recreated a ZooKeeper, session" +
+ " is: 0x"+Long.toHexString(zk.getSessionId()));
}
/**
@@ -124,6 +139,7 @@ public class RecoverableZooKeeper {
throw e;
case CONNECTIONLOSS:
+ case SESSIONEXPIRED:
case OPERATIONTIMEOUT:
retryOrThrow(retryCounter, e, "delete");
break;
@@ -151,6 +167,7 @@ public class RecoverableZooKeeper {
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
+ case SESSIONEXPIRED:
case OPERATIONTIMEOUT:
retryOrThrow(retryCounter, e, "exists");
break;
@@ -177,6 +194,7 @@ public class RecoverableZooKeeper {
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
+ case SESSIONEXPIRED:
case OPERATIONTIMEOUT:
retryOrThrow(retryCounter, e, "exists");
break;
@@ -213,6 +231,7 @@ public class RecoverableZooKeeper {
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
+ case SESSIONEXPIRED:
case OPERATIONTIMEOUT:
retryOrThrow(retryCounter, e, "getChildren");
break;
@@ -239,6 +258,7 @@ public class RecoverableZooKeeper {
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
+ case SESSIONEXPIRED:
case OPERATIONTIMEOUT:
retryOrThrow(retryCounter, e, "getChildren");
break;
@@ -266,6 +286,7 @@ public class RecoverableZooKeeper {
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
+ case SESSIONEXPIRED:
case OPERATIONTIMEOUT:
retryOrThrow(retryCounter, e, "getData");
break;
@@ -293,6 +314,7 @@ public class RecoverableZooKeeper {
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
+ case SESSIONEXPIRED:
case OPERATIONTIMEOUT:
retryOrThrow(retryCounter, e, "getData");
break;
@@ -322,6 +344,7 @@ public class RecoverableZooKeeper {
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
+ case SESSIONEXPIRED:
case OPERATIONTIMEOUT:
retryOrThrow(retryCounter, e, "setData");
break;
@@ -418,6 +441,7 @@ public class RecoverableZooKeeper {
throw e;
case CONNECTIONLOSS:
+ case SESSIONEXPIRED:
case OPERATIONTIMEOUT:
retryOrThrow(retryCounter, e, "create");
break;
@@ -452,6 +476,7 @@ public class RecoverableZooKeeper {
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
+ case SESSIONEXPIRED:
case OPERATIONTIMEOUT:
retryOrThrow(retryCounter, e, "create");
break;
Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java Thu Aug 23 05:56:29 2012
@@ -243,6 +243,10 @@ public class ZooKeeperWatcher implements
return recoverableZooKeeper;
}
+ public void reconnectAfterExpiration() throws IOException, InterruptedException {
+ recoverableZooKeeper.reconnectAfterExpiration();
+ }
+
/**
* Get the quorum address of this instance.
* @return quorum string of this zookeeper connection instance
Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java Thu Aug 23 05:56:29 2012
@@ -39,6 +39,7 @@ import java.util.Map;
import java.util.NavigableSet;
import java.util.Random;
import java.util.UUID;
+import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -71,6 +72,7 @@ import org.apache.hadoop.hbase.regionser
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSUtils;
+import org.apache.hadoop.hbase.util.JVMClusterUtil;
import org.apache.hadoop.hbase.util.RegionSplitter;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.util.Writables;
@@ -84,6 +86,7 @@ import org.apache.hadoop.hdfs.MiniDFSClu
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.apache.zookeeper.ZooKeeper;
@@ -1342,7 +1345,7 @@ public class HBaseTestingUtility {
*/
public void expireMasterSession() throws Exception {
HMaster master = hbaseCluster.getMaster();
- expireSession(master.getZooKeeper(), master);
+ expireSession(master.getZooKeeper(), false);
}
/**
@@ -1352,16 +1355,22 @@ public class HBaseTestingUtility {
*/
public void expireRegionServerSession(int index) throws Exception {
HRegionServer rs = hbaseCluster.getRegionServer(index);
- expireSession(rs.getZooKeeper(), rs);
+ expireSession(rs.getZooKeeper(), false);
}
- public void expireSession(ZooKeeperWatcher nodeZK, Server server)
+ /**
+ * Expire a ZooKeeper session as recommended in ZooKeeper documentation
+ * http://wiki.apache.org/hadoop/ZooKeeper/FAQ#A4
+ * There are issues when doing this:
+ * [1] http://www.mail-archive.com/dev@zookeeper.apache.org/msg01942.html
+ * [2] https://issues.apache.org/jira/browse/ZOOKEEPER-1105
+ *
+ * @param nodeZK - the ZK watcher whose session is to be expired
+ * @param checkStatus - true to check whether we can create an HTable with the
+ * current configuration.
+ */
+ public void expireSession(ZooKeeperWatcher nodeZK, boolean checkStatus)
throws Exception {
- expireSession(nodeZK, server, false);
- }
-
- public void expireSession(ZooKeeperWatcher nodeZK, Server server,
- boolean checkStatus) throws Exception {
Configuration c = new Configuration(this.conf);
String quorumServers = ZKConfig.getZKQuorumServersString(c);
int sessionTimeout = 500;
@@ -1369,14 +1378,29 @@ public class HBaseTestingUtility {
byte[] password = zk.getSessionPasswd();
long sessionID = zk.getSessionId();
+ // Expiry seems to be asynchronous (see comment from P. Hunt in [1]),
+ // so we create a first watcher to be sure that the
+ // event was sent. We expect that if our watcher receives the event
+ // other watchers on the same machine will get it as well.
+ // When we ask to close the connection, ZK does not close it before
+ // we receive all the events, so we don't have to capture the event, just
+ // closing the connection should be enough.
+ ZooKeeper monitor = new ZooKeeper(quorumServers,
+ 1000, new org.apache.zookeeper.Watcher(){
+ @Override
+ public void process(WatchedEvent watchedEvent) {
+ LOG.info("Monitor ZKW received event="+watchedEvent);
+ }
+ } , sessionID, password);
+
+ // Making it expire
ZooKeeper newZK = new ZooKeeper(quorumServers,
sessionTimeout, EmptyWatcher.instance, sessionID, password);
newZK.close();
- final long sleep = 7000; // 7s seems enough to manage the timeout
- LOG.info("ZK Closed Session 0x" + Long.toHexString(sessionID) +
- "; sleeping=" + sleep);
+ LOG.info("ZK Closed Session 0x" + Long.toHexString(sessionID));
- Thread.sleep(sleep);
+ // Now closing & waiting to be sure that the clients get it.
+ monitor.close();
if (checkStatus) {
new HTable(new Configuration(conf), HConstants.META_TABLE_NAME).close();
@@ -1545,7 +1569,7 @@ public class HBaseTestingUtility {
* Make sure that at least the specified number of region servers
* are running
* @param num minimum number of region servers that should be running
- * @return True if we started some servers
+ * @return true if we started some servers
* @throws IOException
*/
public boolean ensureSomeRegionServersAvailable(final int num)
@@ -1561,6 +1585,31 @@ public class HBaseTestingUtility {
}
+ /**
+ * Make sure that at least the specified number of region servers
+ * are running. We don't count the ones that are currently stopping or are
+ * stopped.
+ * @param num minimum number of region servers that should be running
+ * @return true if we started some servers
+ * @throws IOException
+ */
+ public boolean ensureSomeNonStoppedRegionServersAvailable(final int num)
+ throws IOException {
+ boolean startedServer = ensureSomeRegionServersAvailable(num);
+
+ for (JVMClusterUtil.RegionServerThread rst :
+ hbaseCluster.getRegionServerThreads()) {
+
+ HRegionServer hrs = rst.getRegionServer();
+ if (hrs.isStopping() || hrs.isStopped()) {
+ LOG.info("A region server is stopped or stopping:"+hrs);
+ LOG.info("Started new server=" + hbaseCluster.startRegionServer());
+ startedServer = true;
+ }
+ }
+
+ return startedServer;
+ }
/**
Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java Thu Aug 23 05:56:29 2012
@@ -26,6 +26,8 @@ import static org.junit.Assert.assertNul
import static org.junit.Assert.fail;
import java.io.IOException;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
@@ -40,6 +42,7 @@ import org.apache.hadoop.hbase.client.HC
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.zookeeper.ZKConfig;
@@ -97,6 +100,15 @@ public class TestZooKeeper {
TEST_UTIL.ensureSomeRegionServersAvailable(2);
}
+ private ZooKeeperWatcher getZooKeeperWatcher(HConnection c) throws
+ NoSuchMethodException, InvocationTargetException, IllegalAccessException {
+
+ Method getterZK = c.getClass().getMethod("getKeepAliveZooKeeperWatcher");
+ getterZK.setAccessible(true);
+
+ return (ZooKeeperWatcher) getterZK.invoke(c);
+ }
+
/**
* See HBASE-1232 and http://wiki.apache.org/hadoop/ZooKeeper/FAQ#4.
* @throws IOException
@@ -111,7 +123,7 @@ public class TestZooKeeper {
new HTable(c, HConstants.META_TABLE_NAME).close();
HConnection connection = HConnectionManager.getConnection(c);
ZooKeeperWatcher connectionZK = connection.getZooKeeperWatcher();
- TEST_UTIL.expireSession(connectionZK, null);
+ TEST_UTIL.expireSession(connectionZK, false);
// provoke session expiration by doing something with ZK
ZKUtil.dump(connectionZK);
@@ -350,6 +362,21 @@ public class TestZooKeeper {
ZKUtil.getChildDataAndWatchForNewChildren(zkw, "/wrongNode");
}
+ /**
+ * Master recovery when the znode already exists. Internally, this
+ * test differs from {@link #testMasterSessionExpired} because here
+ * the master znode will exist in ZK.
+ */
+ @Test(timeout=20000)
+ public void testMasterZKSessionRecoveryFailure() throws Exception {
+ MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+ HMaster m = cluster.getMaster();
+ m.abort("Test recovery from zk session expired",
+ new KeeperException.SessionExpiredException());
+ assertFalse(m.isStopped());
+ testSanity();
+ }
+
@org.junit.Rule
public org.apache.hadoop.hbase.ResourceCheckerJUnitRule cu =
new org.apache.hadoop.hbase.ResourceCheckerJUnitRule();
Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java Thu Aug 23 05:56:29 2012
@@ -87,6 +87,8 @@ public class TestDistributedLogSplitting
LOG.info("Starting cluster");
conf = HBaseConfiguration.create();
conf.getLong("hbase.splitlog.max.resubmit", 0);
+ // Make the failure test faster
+ conf.setInt("zookeeper.recovery.retry", 0);
TEST_UTIL = new HBaseTestingUtility(conf);
TEST_UTIL.startMiniCluster(NUM_MASTERS, num_rs);
cluster = TEST_UTIL.getHBaseCluster();
@@ -245,7 +247,7 @@ public class TestDistributedLogSplitting
slm.enqueueSplitTask(logfiles[0].getPath().toString(), batch);
//waitForCounter but for one of the 2 counters
long curt = System.currentTimeMillis();
- long waitTime = 30000;
+ long waitTime = 80000;
long endt = curt + waitTime;
while (curt < endt) {
if ((tot_wkr_task_resigned.get() + tot_wkr_task_err.get() +
Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java Thu Aug 23 05:56:29 2012
@@ -80,40 +80,6 @@ public class TestMasterZKSessionRecovery
}
/**
- * Negative test of master recovery from zk session expiry.
- * <p>
- * Starts with one master. Fakes the master zk session expired.
- * The master should be able to come up if he is able to create
- * the node as active master.
- * @throws Exception
- */
- @Test(timeout=10000)
- public void testMasterZKSessionRecoveryFailure() throws Exception {
- MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
- HMaster m = cluster.getMaster();
- m.abort("Test recovery from zk session expired",
- new KeeperException.SessionExpiredException());
- assertFalse(m.isStopped());
- }
-
- /**
- * Positive test of master recovery from zk session expiry.
- * <p>
- * Starts with one master. Closes the master zk session.
- * Ensures the master can recover the expired zk session.
- * @throws Exception
- */
- @Test(timeout=60000)
- public void testMasterZKSessionRecoverySuccess() throws Exception {
- MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
- HMaster m = cluster.getMaster();
- m.getZooKeeperWatcher().close();
- m.abort("Test recovery from zk session expired",
- new KeeperException.SessionExpiredException());
- assertFalse(m.isStopped());
- }
-
- /**
* Tests that the master does not call retainAssignment after recovery from
* expired zookeeper session. Without the HBASE-6046 fix master always tries
* to assign all the user regions by calling retainAssignment.
Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java Thu Aug 23 05:56:29 2012
@@ -20,6 +20,7 @@
package org.apache.hadoop.hbase.regionserver;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNotSame;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertFalse;
@@ -83,7 +84,7 @@ public class TestSplitTransactionOnClust
}
@Before public void setup() throws IOException {
- TESTING_UTIL.ensureSomeRegionServersAvailable(NB_SERVERS);
+ TESTING_UTIL.ensureSomeNonStoppedRegionServersAvailable(NB_SERVERS);
this.admin = new HBaseAdmin(TESTING_UTIL.getConfiguration());
this.cluster = TESTING_UTIL.getMiniHBaseCluster();
}
@@ -651,7 +652,10 @@ public class TestSplitTransactionOnClust
HRegionServer tableRegionServer = cluster.getRegionServer(tableRegionIndex);
if (metaRegionServer.getServerName().equals(tableRegionServer.getServerName())) {
HRegionServer hrs = getOtherRegionServer(cluster, metaRegionServer);
- LOG.info("Moving " + hri.getRegionNameAsString() + " to " +
+ assertNotNull(hrs);
+ assertNotNull(hri);
+ LOG.
+ info("Moving " + hri.getRegionNameAsString() + " to " +
hrs.getServerName() + "; metaServerIndex=" + metaServerIndex);
admin.move(hri.getEncodedNameAsBytes(),
Bytes.toBytes(hrs.getServerName().toString()));
Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java?rev=1376374&r1=1376373&r2=1376374&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java Thu Aug 23 05:56:29 2012
@@ -100,6 +100,8 @@ public class TestReplication {
conf1.setLong("replication.source.sleepforretries", 100);
conf1.setInt("hbase.regionserver.maxlogs", 10);
conf1.setLong("hbase.master.logcleaner.ttl", 10);
+ conf1.setInt("zookeeper.recovery.retry", 1);
+ conf1.setInt("zookeeper.recovery.retry.intervalmill", 10);
conf1.setBoolean(HConstants.REPLICATION_ENABLE_KEY, true);
conf1.setBoolean("dfs.support.append", true);
conf1.setLong(HConstants.THREAD_WAKE_FREQUENCY, 100);
@@ -757,9 +759,11 @@ public class TestReplication {
int lastCount = 0;
+ final long start = System.currentTimeMillis();
for (int i = 0; i < NB_RETRIES; i++) {
if (i==NB_RETRIES-1) {
- fail("Waited too much time for queueFailover replication");
+ fail("Waited too much time for queueFailover replication. " +
+ "Waited "+(System.currentTimeMillis() - start)+"ms.");
}
Scan scan2 = new Scan();
ResultScanner scanner2 = htable2.getScanner(scan2);