You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2010/10/09 19:55:34 UTC

svn commit: r1006202 - in /hbase/trunk: CHANGES.txt src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java

Author: stack
Date: Sat Oct  9 17:55:33 2010
New Revision: 1006202

URL: http://svn.apache.org/viewvc?rev=1006202&view=rev
Log:
HBASE-3062 ZooKeeper KeeperException$ConnectionLossException is a "recoverable" exception; we should retry a while on server startup at least.

Modified:
    hbase/trunk/CHANGES.txt
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java

Modified: hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=1006202&r1=1006201&r2=1006202&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Sat Oct  9 17:55:33 2010
@@ -571,6 +571,9 @@ Release 0.21.0 - Unreleased
    HBASE-3008  Memstore.updateColumnValue passes wrong flag to heapSizeChange
                (Causes memstore size to go negative)
    HBASE-3089  REST tests are broken locally and up in hudson
+   HBASE-3062  ZooKeeper KeeperException$ConnectionLossException is a
+               "recoverable" exception; we should retry a while on server
+               startup at least.
   
 
 

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=1006202&r1=1006201&r2=1006202&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Sat Oct  9 17:55:33 2010
@@ -402,11 +402,17 @@ public class HRegionServer implements HR
    * @throws InterruptedException
    */
   private void initialize() throws IOException, InterruptedException {
-    initializeZooKeeper();
-    initializeThreads();
-    int nbBlocks = conf.getInt("hbase.regionserver.nbreservationblocks", 4);
-    for (int i = 0; i < nbBlocks; i++) {
-      reservedSpace.add(new byte[HConstants.DEFAULT_SIZE_RESERVATION_BLOCK]);
+    try {
+      initializeZooKeeper();
+      initializeThreads();
+      int nbBlocks = conf.getInt("hbase.regionserver.nbreservationblocks", 4);
+      for (int i = 0; i < nbBlocks; i++) {
+        reservedSpace.add(new byte[HConstants.DEFAULT_SIZE_RESERVATION_BLOCK]);
+      }
+    } catch (Throwable t) {
+      // Stop on any error; otherwise the process sticks around forever because
+      // the server puts up non-daemon threads.
+      this.server.stop();
     }
   }
 
@@ -2465,4 +2471,4 @@ public class HRegionServer implements HR
 
     new HRegionServerCommandLine(regionServerClass).doMain(args);
   }
-}
\ No newline at end of file
+}

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java?rev=1006202&r1=1006201&r2=1006202&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java Sat Oct  9 17:55:33 2010
@@ -104,7 +104,30 @@ public class ZooKeeperWatcher implements
     try {
       // Create all the necessary "directories" of znodes
       // TODO: Move this to an init method somewhere so not everyone calls it?
-      ZKUtil.createAndFailSilent(this, baseZNode);
+
+      // The first call against zk can fail with connection loss.  Seems common.
+      // Apparently this is recoverable.  Retry a while.
+      // See http://wiki.apache.org/hadoop/ZooKeeper/ErrorHandling
+      // TODO: Generalize out in ZKUtil.
+      long wait = conf.getLong("hbase.zookeeper.recoverable.waittime", 10000);
+      long finished = System.currentTimeMillis() + wait;
+      KeeperException ke = null;
+      do {
+        try {
+          ZKUtil.createAndFailSilent(this, baseZNode);
+          ke = null;
+          break;
+        } catch (KeeperException.ConnectionLossException e) {
+          if (LOG.isDebugEnabled() && (isFinishedRetryingRecoverable(finished))) {
+            LOG.debug("Retrying zk create for another " +
+              (finished - System.currentTimeMillis()) +
+              "ms; set 'hbase.zookeeper.recoverable.waittime' to change " +
+              "wait time); " + e.getMessage());
+          }
+          ke = e;
+        }
+      } while (isFinishedRetryingRecoverable(finished));
+      if (ke != null) throw ke;
       ZKUtil.createAndFailSilent(this, assignmentZNode);
       ZKUtil.createAndFailSilent(this, rsZNode);
       ZKUtil.createAndFailSilent(this, tableZNode);
@@ -114,6 +137,10 @@ public class ZooKeeperWatcher implements
     }
   }
 
+  private boolean isFinishedRetryingRecoverable(final long finished) {
+    return System.currentTimeMillis() < finished;
+  }
+
   @Override
   public String toString() {
     return this.identifier;