You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2010/10/09 19:55:34 UTC
svn commit: r1006202 - in /hbase/trunk: CHANGES.txt
src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
Author: stack
Date: Sat Oct 9 17:55:33 2010
New Revision: 1006202
URL: http://svn.apache.org/viewvc?rev=1006202&view=rev
Log:
HBASE-3062 ZooKeeper KeeperException is a recoverable exception; we should retry a while on server startup at least.
Modified:
hbase/trunk/CHANGES.txt
hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
Modified: hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=1006202&r1=1006201&r2=1006202&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Sat Oct 9 17:55:33 2010
@@ -571,6 +571,9 @@ Release 0.21.0 - Unreleased
HBASE-3008 Memstore.updateColumnValue passes wrong flag to heapSizeChange
(Causes memstore size to go negative)
HBASE-3089 REST tests are broken locally and up in hudson
+ HBASE-3062 ZooKeeper KeeperException$ConnectionLossException is a
+ "recoverable" exception; we should retry a while on server
+ startup at least.
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=1006202&r1=1006201&r2=1006202&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Sat Oct 9 17:55:33 2010
@@ -402,11 +402,17 @@ public class HRegionServer implements HR
* @throws InterruptedException
*/
private void initialize() throws IOException, InterruptedException {
- initializeZooKeeper();
- initializeThreads();
- int nbBlocks = conf.getInt("hbase.regionserver.nbreservationblocks", 4);
- for (int i = 0; i < nbBlocks; i++) {
- reservedSpace.add(new byte[HConstants.DEFAULT_SIZE_RESERVATION_BLOCK]);
+ try {
+ initializeZooKeeper();
+ initializeThreads();
+ int nbBlocks = conf.getInt("hbase.regionserver.nbreservationblocks", 4);
+ for (int i = 0; i < nbBlocks; i++) {
+ reservedSpace.add(new byte[HConstants.DEFAULT_SIZE_RESERVATION_BLOCK]);
+ }
+ } catch (Throwable t) {
+ // Call stop if error or process will stick around for ever since server
+ // puts up non-daemon threads.
+ this.server.stop();
}
}
@@ -2465,4 +2471,4 @@ public class HRegionServer implements HR
new HRegionServerCommandLine(regionServerClass).doMain(args);
}
-}
\ No newline at end of file
+}
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java?rev=1006202&r1=1006201&r2=1006202&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java Sat Oct 9 17:55:33 2010
@@ -104,7 +104,30 @@ public class ZooKeeperWatcher implements
try {
// Create all the necessary "directories" of znodes
// TODO: Move this to an init method somewhere so not everyone calls it?
- ZKUtil.createAndFailSilent(this, baseZNode);
+
+ // The first call against zk can fail with connection loss. Seems common.
+ // Apparently this is recoverable. Retry a while.
+ // See http://wiki.apache.org/hadoop/ZooKeeper/ErrorHandling
+ // TODO: Generalize out in ZKUtil.
+ long wait = conf.getLong("hbase.zookeeper.recoverable.waittime", 10000);
+ long finished = System.currentTimeMillis() + wait;
+ KeeperException ke = null;
+ do {
+ try {
+ ZKUtil.createAndFailSilent(this, baseZNode);
+ ke = null;
+ break;
+ } catch (KeeperException.ConnectionLossException e) {
+ if (LOG.isDebugEnabled() && (isFinishedRetryingRecoverable(finished))) {
+ LOG.debug("Retrying zk create for another " +
+ (finished - System.currentTimeMillis()) +
+ "ms; set 'hbase.zookeeper.recoverable.waittime' to change " +
+ "wait time); " + e.getMessage());
+ }
+ ke = e;
+ }
+ } while (isFinishedRetryingRecoverable(finished));
+ if (ke != null) throw ke;
ZKUtil.createAndFailSilent(this, assignmentZNode);
ZKUtil.createAndFailSilent(this, rsZNode);
ZKUtil.createAndFailSilent(this, tableZNode);
@@ -114,6 +137,10 @@ public class ZooKeeperWatcher implements
}
}
+ private boolean isFinishedRetryingRecoverable(final long finished) {
+ return System.currentTimeMillis() < finished;
+ }
+
@Override
public String toString() {
return this.identifier;