You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ns...@apache.org on 2011/10/11 04:02:59 UTC

svn commit: r1181373 - in /hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase: client/ master/ regionserver/ replication/ zookeeper/

Author: nspiegelberg
Date: Tue Oct 11 02:02:57 2011
New Revision: 1181373

URL: http://svn.apache.org/viewvc?rev=1181373&view=rev
Log:
Cleanup ZK usage in internal hbase branch

Summary:
We need to abort the server when its ZooKeeper session expires.

This is a first cut; I need to go back over the list of what we wanted to fix.
Not meant for commit yet.

I still need to add the fix where we delete the unassigned node after the close
of a disabled region.

Test Plan:
I think this breaks unit tests.

DiffCamp Revision: 152879
Reviewed By: kannan
Commenters: kranganathan
CC: jgray, kannan, kranganathan, hbase@lists
Revert Plan:
OK

Modified:
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/ZKMasterAddressWatcher.java
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/ZKUnassignedWatcher.java
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/replication/ReplicationZookeeperWrapper.java
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java?rev=1181373&r1=1181372&r2=1181373&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java Tue Oct 11 02:02:57 2011
@@ -380,16 +380,18 @@ public class HConnectionManager {
         tries++) {
 
           try {
-            masterLocation = zk.readMasterAddressOrThrow();
+            masterLocation = zk.readMasterAddress(zk);
 
-            HMasterInterface tryMaster = (HMasterInterface)HBaseRPC.getProxy(
-                HMasterInterface.class, HBaseRPCProtocolVersion.versionID,
-                masterLocation.getInetSocketAddress(), this.conf);
-
-            if (tryMaster.isMasterRunning()) {
-              this.master = tryMaster;
-              this.masterLock.notifyAll();
-              break;
+            if (masterLocation != null) {
+              HMasterInterface tryMaster = (HMasterInterface)HBaseRPC.getProxy(
+                  HMasterInterface.class, HBaseRPCProtocolVersion.versionID,
+                  masterLocation.getInetSocketAddress(), this.conf);
+
+              if (tryMaster.isMasterRunning()) {
+                this.master = tryMaster;
+                this.masterLock.notifyAll();
+                break;
+              }
             }
 
           } catch (IOException e) {

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1181373&r1=1181372&r2=1181373&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Tue Oct 11 02:02:57 2011
@@ -66,13 +66,12 @@ import org.apache.hadoop.hbase.RemoteExc
 import org.apache.hadoop.hbase.TableExistsException;
 import org.apache.hadoop.hbase.client.Get;
 import org.apache.hadoop.hbase.client.HBaseAdmin;
-import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.client.MetaScanner;
-import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
+import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.client.Scan;
 import org.apache.hadoop.hbase.client.ServerConnection;
 import org.apache.hadoop.hbase.client.ServerConnectionManager;
-import org.apache.hadoop.hbase.executor.HBaseEventHandler;
+import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
 import org.apache.hadoop.hbase.executor.HBaseExecutorService;
 import org.apache.hadoop.hbase.executor.HBaseEventHandler.HBaseEventType;
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
@@ -101,8 +100,6 @@ import org.apache.hadoop.ipc.RemoteExcep
 import org.apache.hadoop.net.DNS;
 import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
-import org.apache.zookeeper.Watcher.Event.EventType;
-import org.apache.zookeeper.Watcher.Event.KeeperState;
 
 import com.google.common.collect.Lists;
 
@@ -1182,41 +1179,7 @@ public class HMaster extends Thread impl
    */
   @Override
   public void process(WatchedEvent event) {
-    LOG.debug("Event " + event.getType() +
-              " with state " + event.getState() +
-              " with path " + event.getPath());
-    // Master should kill itself if its session expired or if its
-    // znode was deleted manually (usually for testing purposes)
-    if(event.getState() == KeeperState.Expired ||
-      (event.getType().equals(EventType.NodeDeleted) &&
-        event.getPath().equals(this.zooKeeperWrapper.getMasterElectionZNode())) &&
-        !shutdownRequested.get()) {
-
-      LOG.info("Master lost its znode, trying to get a new one");
-
-      // Can we still be the master? If not, goodbye
-
-      zooKeeperWrapper.close();
-      try {
-        zooKeeperWrapper =
-            ZooKeeperWrapper.createInstance(conf, HMaster.class.getName());
-        zooKeeperWrapper.registerListener(this);
-        this.zkMasterAddressWatcher.setZookeeper(zooKeeperWrapper);
-        if(!this.zkMasterAddressWatcher.
-            writeAddressToZooKeeper(this.address,false)) {
-          throw new Exception("Another Master is currently active");
-        }
-
-        // we are a failed over master, reset the fact that we started the
-        // cluster
-        resetClusterStartup();
-        // Verify the cluster to see if anything happened while we were away
-        joinCluster();
-      } catch (Exception e) {
-        LOG.error("Killing master because of", e);
-        System.exit(1);
-      }
-    }
+    // no-op now
   }
 
   private static void printUsageAndExit() {

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java?rev=1181373&r1=1181372&r2=1181373&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java Tue Oct 11 02:02:57 2011
@@ -950,6 +950,7 @@ public class RegionManager {
   public void removeRegion(HRegionInfo info) {
     synchronized (this.regionsInTransition) {
       this.regionsInTransition.remove(info.getRegionNameAsString());
+      zkWrapper.deleteUnassignedRegion(info.getEncodedName());
     }
   }
 

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/ZKMasterAddressWatcher.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/ZKMasterAddressWatcher.java?rev=1181373&r1=1181372&r2=1181373&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/ZKMasterAddressWatcher.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/ZKMasterAddressWatcher.java Tue Oct 11 02:02:57 2011
@@ -61,20 +61,27 @@ class ZKMasterAddressWatcher implements 
   @Override
   public synchronized void process (WatchedEvent event) {
     EventType type = event.getType();
-    LOG.debug(("Got event " + type + " with path " + event.getPath()));
     if (type.equals(EventType.NodeDeleted)) {
       if (event.getPath().equals(this.zookeeper.clusterStateZNode)) {
         LOG.info("Cluster shutdown while waiting, shutting down" +
           " this master.");
         this.requestShutdown.set(true);
-      } else {
-        LOG.debug("Master address ZNode deleted, notifying waiting masters");
+      } else if (event.getPath().equals(this.zookeeper.masterElectionZNode)){
+        LOG.info("Master address ZNode deleted, notifying waiting masters");
         notifyAll();
       }
-    } else if(type.equals(EventType.NodeCreated) &&
-        event.getPath().equals(this.zookeeper.clusterStateZNode)) {
-      LOG.debug("Resetting watch on cluster state node.");
-      this.zookeeper.setClusterStateWatch();
+    } else if(type.equals(EventType.NodeCreated)) {
+      if (event.getPath().equals(this.zookeeper.clusterStateZNode)) {
+        LOG.info("Resetting watch on cluster state node.");
+        this.zookeeper.setClusterStateWatch();
+      } else if (event.getPath().equals(this.zookeeper.masterElectionZNode)) {
+        LOG.info("Master address ZNode created, check exists and reset watch");
+        if (!zookeeper.exists(zookeeper.masterElectionZNode, true)) {
+          LOG.debug("Got NodeCreated for master node but it does not exist now" +
+              ", notifying");
+          notifyAll();
+        }
+      }
     }
   }
 
@@ -83,7 +90,7 @@ class ZKMasterAddressWatcher implements 
    * blocks until the master address ZNode gets deleted.
    */
   public synchronized void waitForMasterAddressAvailability() {
-    while (zookeeper.readMasterAddress(this) != null) {
+    while (zookeeper.readMasterAddress(zookeeper) != null) {
       try {
         LOG.debug("Waiting for master address ZNode to be deleted " +
           "(Also watching cluster state node)");
@@ -112,7 +119,7 @@ class ZKMasterAddressWatcher implements 
         this.zookeeper.setClusterState(true);
         this.zookeeper.setClusterStateWatch();
         // Watch our own node
-        this.zookeeper.readMasterAddress(this);
+        this.zookeeper.readMasterAddress(zookeeper);
         return true;
       }
     } while(retry);

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/ZKUnassignedWatcher.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/ZKUnassignedWatcher.java?rev=1181373&r1=1181372&r2=1181373&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/ZKUnassignedWatcher.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/ZKUnassignedWatcher.java Tue Oct 11 02:02:57 2011
@@ -92,10 +92,6 @@ public class ZKUnassignedWatcher impleme
   @Override
   public synchronized void process(WatchedEvent event) {
     EventType type = event.getType();
-    LOG.debug("ZK-EVENT-PROCESS: Got zkEvent " + type +
-              " state:" + event.getState() +
-              " path:" + event.getPath());
-
     // Handle the ignored events
     if(type.equals(EventType.None)       ||
        type.equals(EventType.NodeDeleted)) {
@@ -109,6 +105,9 @@ public class ZKUnassignedWatcher impleme
       return;
     }
 
+    LOG.debug("ZK-EVENT-PROCESS: Got zkEvent " + type +
+              " path:" + event.getPath());
+
     try
     {
       /*

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=1181373&r1=1181372&r2=1181373&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Tue Oct 11 02:02:57 2011
@@ -112,6 +112,7 @@ import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.net.DNS;
 import org.apache.hadoop.util.Progressable;
 import org.apache.hadoop.util.StringUtils;
+import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
 import org.apache.zookeeper.Watcher.Event.EventType;
@@ -322,18 +323,22 @@ public class HRegionServer implements HR
         "hbase-958 debugging");
     }
     reinitializeThreads();
-    reinitializeZooKeeper();
+    initializeZooKeeper();
     int nbBlocks = conf.getInt("hbase.regionserver.nbreservationblocks", 4);
     for(int i = 0; i < nbBlocks; i++)  {
       reservedSpace.add(new byte[HConstants.DEFAULT_SIZE_RESERVATION_BLOCK]);
     }
   }
 
-  private void reinitializeZooKeeper() throws IOException {
+  private void initializeZooKeeper() throws IOException {
     zooKeeperWrapper =
         ZooKeeperWrapper.createInstance(conf, serverInfo.getServerName());
     zooKeeperWrapper.registerListener(this);
-    watchMasterAddress();
+    try {
+      zooKeeperWrapper.watchMasterAddress(zooKeeperWrapper);
+    } catch (KeeperException e) {
+      throw new IOException(e);
+    }
   }
 
   private void reinitializeThreads() {
@@ -371,8 +376,6 @@ public class HRegionServer implements HR
   public void process(WatchedEvent event) {
     EventType type = event.getType();
     KeeperState state = event.getState();
-    LOG.info("Got ZooKeeper event, state: " + state + ", type: " +
-      type + ", path: " + event.getPath());
 
     // Ignore events if we're shutting down.
     if (this.stopRequested.get()) {
@@ -380,48 +383,34 @@ public class HRegionServer implements HR
       return;
     }
 
-    if (state == KeeperState.Expired) {
-      LOG.error("ZooKeeper session expired");
-      boolean restart =
-        this.conf.getBoolean("hbase.regionserver.restart.on.zk.expire", false);
-      if (restart) {
-        restart();
-      } else {
-        abort("ZooKeeper session expired");
-      }
-    } else if (type == EventType.NodeDeleted) {
-      watchMasterAddress();
-    } else if (type == EventType.NodeCreated) {
-      getMaster();
+    if (!event.getPath().equals(zooKeeperWrapper.masterElectionZNode)) {
+      return;
+    }
 
-      // ZooKeeper watches are one time only, so we need to re-register our watch.
-      watchMasterAddress();
+    try {
+      if (type == EventType.NodeDeleted) {
+        handleMasterNodeDeleted();
+      } else if (type == EventType.NodeCreated) {
+        handleMasterNodeCreated();
+      }
+    } catch(KeeperException ke) {
+      LOG.error("KeeperException handling master failover", ke);
+      abort("ZooKeeper exception handling master failover");
     }
   }
 
-  private void watchMasterAddress() {
-    while (!stopRequested.get() && !zooKeeperWrapper.watchMasterAddress(this)) {
-      LOG.warn("Unable to set watcher on ZooKeeper master address. Retrying.");
-      sleeper.sleep();
+  private void handleMasterNodeDeleted() throws KeeperException {
+    if(zooKeeperWrapper.watchMasterAddress(zooKeeperWrapper)) {
+      handleMasterNodeCreated();
     }
   }
 
-  private void restart() {
-    abort("Restarting region server");
-    Threads.shutdown(regionServerThread);
-    boolean done = false;
-    while (!done) {
-      try {
-        reinitialize();
-        done = true;
-      } catch (IOException e) {
-        LOG.debug("Error trying to reinitialize ZooKeeper", e);
-      }
+  private void handleMasterNodeCreated() throws KeeperException {
+    if(!zooKeeperWrapper.watchMasterAddress(zooKeeperWrapper)) {
+      handleMasterNodeDeleted();
+    } else {
+      getMaster();
     }
-    Thread t = new Thread(this);
-    String name = regionServerThread.getName();
-    t.setName(name);
-    t.start();
   }
 
   /** @return ZooKeeperWrapper used by RegionServer. */
@@ -1204,11 +1193,11 @@ public class HRegionServer implements HR
         return false;
       }
       try {
-        masterAddress = zooKeeperWrapper.readMasterAddressOrThrow();
-      } catch (IOException e) {
-        LOG.warn("Unable to read master address from ZooKeeper. Retrying." +
-                 " Error was:", e);
-        sleeper.sleep();
+        masterAddress = zooKeeperWrapper.readAddressOrThrow(
+            zooKeeperWrapper.masterElectionZNode, zooKeeperWrapper);
+      } catch (KeeperException e) {
+        LOG.warn("Unable to read master address from ZooKeeper.", e);
+        abort("Unable to read master address in ZK");
       }
     }
 

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/replication/ReplicationZookeeperWrapper.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/replication/ReplicationZookeeperWrapper.java?rev=1181373&r1=1181372&r2=1181373&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/replication/ReplicationZookeeperWrapper.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/replication/ReplicationZookeeperWrapper.java Tue Oct 11 02:02:57 2011
@@ -483,8 +483,8 @@ public class ReplicationZookeeperWrapper
     @Override
     public void process(WatchedEvent watchedEvent) {
       Event.EventType type = watchedEvent.getType();
-      LOG.info("Got event " + type + " with path " + watchedEvent.getPath());
       if (type.equals(Event.EventType.NodeDataChanged)) {
+        LOG.info("Got event " + type + " with path " + watchedEvent.getPath());
         setReplicating();
       }
     }

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java?rev=1181373&r1=1181372&r2=1181373&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java Tue Oct 11 02:02:57 2011
@@ -55,6 +55,8 @@ import org.apache.zookeeper.KeeperExcept
 import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
 import org.apache.zookeeper.ZooKeeper;
+import org.apache.zookeeper.Watcher.Event.EventType;
+import org.apache.zookeeper.Watcher.Event.KeeperState;
 import org.apache.zookeeper.ZooDefs.Ids;
 import org.apache.zookeeper.ZooKeeper.States;
 import org.apache.zookeeper.data.Stat;
@@ -127,7 +129,7 @@ public class ZooKeeperWrapper implements
   /*
    * ZNode used for election of the primary master when there are secondaries.
    */
-  private final String masterElectionZNode;
+  public final String masterElectionZNode;
   /*
    * State of the cluster - if up and running or shutting down
    */
@@ -191,7 +193,6 @@ public class ZooKeeperWrapper implements
                             HConstants.ZOOKEEPER_CONFIG_NAME);
     }
     sessionTimeout = conf.getInt("zookeeper.session.timeout", 60 * 1000);
-    reconnectToZk();
 
     parentZNode = conf.get(HConstants.ZOOKEEPER_ZNODE_PARENT, HConstants.DEFAULT_ZOOKEEPER_ZNODE_PARENT);
 
@@ -206,26 +207,47 @@ public class ZooKeeperWrapper implements
     rgnsInTransitZNode  = getZNode(parentZNode, regionsInTransitZNodeName);
     masterElectionZNode = getZNode(parentZNode, masterAddressZNodeName);
     clusterStateZNode   = getZNode(parentZNode, stateZNodeName);
+
+    connectToZk();
   }
 
-  public void reconnectToZk() throws IOException {
+  public void connectToZk() throws IOException {
     try {
-      LOG.info("Reconnecting to zookeeper");
+      LOG.info("Connecting to zookeeper");
       if(zooKeeper != null) {
         zooKeeper.close();
-        LOG.debug("<" + instanceName + ">" + "Closed existing zookeeper client");
+        LOG.error("<" + instanceName + ">" + " Closed existing zookeeper client");
       }
       zooKeeper = new ZooKeeper(quorumServers, sessionTimeout, this);
-      LOG.debug("<" + instanceName + ">" + "Connected to zookeeper again");
+      LOG.debug("<" + instanceName + ">" + " Connected to zookeeper");
+      // Ensure we are actually connected
+      ensureZkAvailable(10);
     } catch (IOException e) {
-      LOG.error("<" + instanceName + ">" + "Failed to create ZooKeeper object: " + e);
+      LOG.error("<" + instanceName + "> " + "Failed to create ZooKeeper object: " + e);
       throw new IOException(e);
     } catch (InterruptedException e) {
-      LOG.error("<" + instanceName + ">" + "Error closing ZK connection: " + e);
+      LOG.error("<" + instanceName + " >" + "Error closing ZK connection: " + e);
       throw new IOException(e);
     }
   }
 
+  private void ensureZkAvailable(int maxRetries)
+  throws IOException, InterruptedException {
+    while (maxRetries-- > 0) {
+      try {
+        zooKeeper.exists(parentZNode, false);
+        return;
+      } catch(KeeperException.ConnectionLossException cle) {
+        LOG.info("Received ZK ConnectionLossException, ZK not done initializing"
+            + ", retrying connect to ZK after 1 second sleep");
+        Thread.sleep(1000);
+      } catch(KeeperException ke) {
+        LOG.error("Received abnormal ZK exception, aborting", ke);
+        throw new IOException(ke);
+      }
+    }
+  }
+
   public synchronized void registerListener(Watcher watcher) {
     listeners.add(watcher);
   }
@@ -240,11 +262,27 @@ public class ZooKeeperWrapper implements
    */
   @Override
   public synchronized void process(WatchedEvent event) {
+    LOG.debug("<" + instanceName + "> Received ZK WatchedEvent: " +
+        "[path=" + event.getPath() + "] " +
+        "[state=" + event.getState().toString() + "] " +
+        "[type=" + event.getType().toString() + "]");
+    if (event.getType() == EventType.None) {
+      if (event.getState() == KeeperState.Expired) {
+        LOG.error("ZooKeeper Session Expiration, aborting server");
+        abort();
+      } else if (event.getState() == KeeperState.Disconnected) {
+        LOG.warn("Disconnected from ZooKeeper");
+      } else if (event.getState() == KeeperState.SyncConnected) {
+        LOG.info("Reconnected to ZooKeeper");
+      }
+      return;
+    }
     for(Watcher w : listeners) {
       try {
         w.process(event);
       } catch (Throwable t) {
-        LOG.error("<"+instanceName+">" + "ZK updates listener threw an exception in process()", t);
+        LOG.error("<"+instanceName+">" + " Sub-ZK Watcher threw an exception " +
+            "in process()", t);
       }
     }
   }
@@ -386,26 +424,20 @@ public class ZooKeeperWrapper implements
     return res.toArray(new String[res.size()]);
   }
 
+  /**
+   * Check if the specified znode exists.  Set a watch if boolean is true,
+   * whether or not the node exists.
+   * @param znode
+   * @param watch
+   * @return
+   */
   public boolean exists(String znode, boolean watch) {
     try {
-      return zooKeeper.exists(getZNode(parentZNode, znode), watch?this:null) != null;
-    } catch (KeeperException.SessionExpiredException e) {
-      // if the session has expired try to reconnect to ZK, then perform query
-      try {
-        // TODO: ZK-REFACTOR: We should not reconnect - we should just quit and restart.
-        reconnectToZk();
-        return zooKeeper.exists(getZNode(parentZNode, znode), watch?this:null) != null;
-      } catch (IOException e1) {
-        LOG.error("Error reconnecting to zookeeper", e1);
-        throw new RuntimeException("Error reconnecting to zookeeper", e1);
-      } catch (KeeperException e1) {
-        LOG.error("Error reading after reconnecting to zookeeper", e1);
-        throw new RuntimeException("Error reading after reconnecting to zookeeper", e1);
-      } catch (InterruptedException e1) {
-        LOG.error("Error reading after reconnecting to zookeeper", e1);
-        throw new RuntimeException("Error reading after reconnecting to zookeeper", e1);
-      }
+      return zooKeeper.exists(getZNode(parentZNode, znode), watch ? this : null)
+             != null;
     } catch (KeeperException e) {
+      LOG.error("Received KeeperException on exists() call, aborting", e);
+      abort();
       return false;
     } catch (InterruptedException e) {
       return false;
@@ -455,15 +487,6 @@ public class ZooKeeperWrapper implements
   }
 
   /**
-   * Read address of master server.
-   * @return HServerAddress of master server.
-   * @throws IOException if there's a problem reading the ZNode.
-   */
-  public HServerAddress readMasterAddressOrThrow() throws IOException {
-    return readAddressOrThrow(masterElectionZNode, null);
-  }
-
-  /**
    * Read master address and set a watch on it.
    * @param watcher Watcher to set on master address ZNode if not null.
    * @return HServerAddress of master or null if there was a problem reading the
@@ -521,23 +544,26 @@ public class ZooKeeperWrapper implements
   }
 
   /**
-   * Set a watcher on the master address ZNode. The watcher will be set unless
-   * an exception occurs with ZooKeeper.
+   * Set a watcher on the master address ZNode whether or not the node currently
+   * exists. The watcher will always be set unless this method throws an
+   * exception.  Method will return true if node existed when watch was set,
+   * false if not.
    * @param watcher Watcher to set on master address ZNode.
-   * @return true if watcher was set, false otherwise.
+   * @return true if node exists when watch set, false if not
    */
-  public boolean watchMasterAddress(Watcher watcher) {
+  public boolean watchMasterAddress(Watcher watcher)
+  throws KeeperException {
     try {
-      zooKeeper.exists(masterElectionZNode, watcher);
+      Stat s = zooKeeper.exists(masterElectionZNode, watcher);
+      LOG.debug("<" + instanceName + ">" + " Set watcher on master address ZNode " + masterElectionZNode);
+      return s != null;
     } catch (KeeperException e) {
-      LOG.warn("<" + instanceName + ">" + "Failed to set watcher on ZNode " + masterElectionZNode, e);
-      return false;
+      LOG.warn("<" + instanceName + ">" + " Failed to set watcher on ZNode " + masterElectionZNode, e);
+      throw e;
     } catch (InterruptedException e) {
-      LOG.warn("<" + instanceName + ">" + "Failed to set watcher on ZNode " + masterElectionZNode, e);
+      LOG.warn("<" + instanceName + ">" + " Failed to set watcher on ZNode " + masterElectionZNode, e);
       return false;
     }
-    LOG.debug("<" + instanceName + ">" + "Set watcher on master address ZNode " + masterElectionZNode);
-    return true;
   }
 
   /**
@@ -551,20 +577,30 @@ public class ZooKeeperWrapper implements
     try {
       LOG.debug("<" + instanceName + ">" + "Trying to read " + znode);
       return readAddressOrThrow(znode, watcher);
-    } catch (IOException e) {
+    } catch (KeeperException e) {
       LOG.debug("<" + instanceName + ">" + "Failed to read " + e.getMessage());
       return null;
     }
   }
 
-  private HServerAddress readAddressOrThrow(String znode, Watcher watcher) throws IOException {
+  /**
+   * Reads the specified address from the specified zk node, setting the
+   * specified watcher.  Returns null if the node does not exist.
+   * @param znode
+   * @param watcher
+   * @return
+   * @throws KeeperException
+   */
+  public HServerAddress readAddressOrThrow(String znode, Watcher watcher)
+  throws KeeperException {
     byte[] data;
     try {
       data = zooKeeper.getData(znode, watcher, null);
     } catch (InterruptedException e) {
-      throw new IOException(e);
-    } catch (KeeperException e) {
-      throw new IOException(e);
+      // This should not happen
+      return null;
+    } catch (KeeperException.NoNodeException e) {
+      return null;
     }
 
     String addressString = Bytes.toString(data);
@@ -1110,18 +1146,21 @@ public class ZooKeeperWrapper implements
       // create the znode
       zooKeeper.create(fullyQualifiedZNodeName, data, Ids.OPEN_ACL_UNSAFE, createMode);
       LOG.debug("<" + instanceName + ">" + "Created ZNode " + fullyQualifiedZNodeName + " in ZooKeeper");
-      // watch the znode for deletion, data change, creation of children
-      if(watch) {
-        watchZNode(zNodeName);
-      }
-      return fullyQualifiedZNodeName;
+    } catch (KeeperException.NodeExistsException nee) {
+      LOG.debug("<" + instanceName + "> " + "ZNode " + fullyQualifiedZNodeName + " already exists, still setting watch");
     } catch (InterruptedException e) {
       LOG.warn("<" + instanceName + ">" + "Failed to create ZNode " + fullyQualifiedZNodeName + " in ZooKeeper", e);
+      return null;
     } catch (KeeperException e) {
-      LOG.warn("<" + instanceName + ">" + "Failed to create ZNode " + fullyQualifiedZNodeName + " in ZooKeeper", e);
+      LOG.error("<" + instanceName + ">" + "Failed to create ZNode " + fullyQualifiedZNodeName + " in ZooKeeper", e);
+      return null;
     }
+      // watch the znode for deletion, data change, creation of children
+      if(watch) {
+        watchZNode(zNodeName);
+      }
 
-    return null;
+    return fullyQualifiedZNodeName;
   }
 
   public byte[] readZNode(String znodeName, Stat stat) throws IOException {
@@ -1281,29 +1320,11 @@ public class ZooKeeperWrapper implements
         unassignedZNodesWatched.remove(znode);
         deleteZNode(znode);
       }
-    } catch (KeeperException.SessionExpiredException e) {
-      LOG.error("Zookeeper session has expired", e);
-      // if the session has expired try to reconnect to ZK, then perform query
-      try {
-        // TODO: ZK-REFACTOR: should just quit on reconnect??
-        reconnectToZk();
-        synchronized(unassignedZNodesWatched) {
-          unassignedZNodesWatched.remove(znode);
-          deleteZNode(znode);
-        }
-      } catch (IOException e1) {
-        LOG.error("Error reconnecting to zookeeper", e1);
-        throw new RuntimeException("Error reconnecting to zookeeper", e1);
-      } catch (KeeperException.SessionExpiredException e1) {
-        LOG.error("Error reading after reconnecting to zookeeper", e1);
-        throw new RuntimeException("Error reading after reconnecting to zookeeper", e1);
-      } catch (KeeperException e1) {
-        LOG.error("Error reading after reconnecting to zookeeper", e1);
-      } catch (InterruptedException e1) {
-        LOG.error("Error reading after reconnecting to zookeeper", e1);
-      }
+    } catch (KeeperException.NoNodeException e) {
+      LOG.warn("Attempted to delete an unassigned region node but it DNE");
     } catch (KeeperException e) {
       LOG.error("Error deleting region " + regionName, e);
+      abort();
     } catch (InterruptedException e) {
       LOG.error("Error deleting region " + regionName, e);
     }
@@ -1358,4 +1379,9 @@ public class ZooKeeperWrapper implements
     }
 
   }
+
+  private void abort() {
+    LOG.fatal("<" + instanceName + "> Aborting process because of fatal ZK error");
+    System.exit(1);
+  }
 }