You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by mb...@apache.org on 2012/06/02 08:27:45 UTC

svn commit: r1345452 - in /hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase: master/HMaster.java zookeeper/ZooKeeperWrapper.java

Author: mbautin
Date: Sat Jun  2 06:27:45 2012
New Revision: 1345452

URL: http://svn.apache.org/viewvc?rev=1345452&view=rev
Log:
[master] [0.89-fb] fix setUnassigned to abort the master upon zk failures

Author: aaiyer

Summary:
fix setUnassigned to abort on zk failures

In theory, any operation that contacts ZK can throw an exception (say, because the ZK nodes were unreachable).

Currently, the master catches these exceptions, logs an error, and continues silently. This is wrong.

We have been discussing how to handle these exceptions:
 Approach 1) was to handle these exceptions and retry.
 Approach 2) is to kill the master itself.

We are going with approach 2 here, because we do not expect ZKWrapper to have intermittent failures. Intermittent failures (connection/socket etc.) are already retried by the ZKWrapper. If there is an exception that percolates above, then it means that there is something wrong and the master is unable to contact the ZK.

If the master cannot talk to ZK, it is better to kill the master quickly than to try to handle retries. (The master is going to die regardless, a few seconds after it fails to keep its ZNode alive.)

Test Plan: TBD: test on MR

Reviewers: kannan, pkhemani

Reviewed By: kannan

CC: hbase-eng@, kranganathan

Differential Revision: https://phabricator.fb.com/D482166

Modified:
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1345452&r1=1345451&r2=1345452&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Sat Jun  2 06:27:45 2012
@@ -445,7 +445,7 @@ public class HMaster extends Thread impl
         HConstants.ZOOKEEPER_SESSION_EXPIRED_ABORT_PROCESS, true);
     // Set this property to set zk session timeout for master which is different
     // from what region servers use. The master's zk session timeout can be
-    // much shorter than region server's. It is easier to recycle master becuase
+    // much shorter than region server's. It is easier to recycle master because
     // it doesn't handle data. The region server can have an inflated zk session
     // timeout because they also rely on master to kill them if they miss any
     // heartbeat
@@ -1364,7 +1364,7 @@ public class HMaster extends Thread impl
       throw e;
     } catch (IOException e) {
       LOG.error("Cannot create table " + desc.getNameAsString() + 
-				" because of " + e.toString());
+        " because of " + e.toString());
       throw RemoteExceptionHandler.checkIOException(e);
     }
   }
@@ -1603,12 +1603,12 @@ public class HMaster extends Thread impl
     }
     else {
       List<MetaRegion> metaRegions = regionManager.getListOfOnlineMetaRegions();
-	for (MetaRegion mRegion: metaRegions) {
-		if (Bytes.equals(mRegion.getRegionInfo().getTableDesc().getName(), tableName)) {
-			result.add(new Pair<HRegionInfo, HServerAddress>
+      for (MetaRegion mRegion: metaRegions) {
+        if (Bytes.equals(mRegion.getRegionInfo().getTableDesc().getName(), tableName)) {
+          result.add(new Pair<HRegionInfo, HServerAddress>
               (mRegion.getRegionInfo(), mRegion.getServer()));
-		}
-	}
+        }
+      }
     }
     return result;
   }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java?rev=1345452&r1=1345451&r2=1345452&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java Sat Jun  2 06:27:45 2012
@@ -1413,6 +1413,7 @@ public class ZooKeeperWrapper implements
           oldData = readZNode(znode, stat);
         } catch (IOException e) {
           LOG.error("Error reading data for " + znode);
+          abort("Error reading data for " + znode, e);
         }
         if(oldData == null) {
           LOG.debug("While creating UNASSIGNED region " + regionName + " exists with no data" );
@@ -1458,6 +1459,7 @@ public class ZooKeeperWrapper implements
       oldData = readZNode(znode, stat);
     } catch (IOException e) {
       LOG.error("Error reading data for " + znode);
+      abort("Error reading data for " + znode, e);
     }
     // If there is no data in the ZNode, then update it
     if(oldData == null) {
@@ -1488,6 +1490,7 @@ public class ZooKeeperWrapper implements
         writeZNode(znode, data, -1, true);
       } catch (IOException e) {
         LOG.error("Error writing data for " + znode + ", could not update state to " + (HBaseEventType.fromByte(data[0])));
+        abort("Error writing data for " + znode, e);
       }
     }
   }
@@ -1878,7 +1881,7 @@ public class ZooKeeperWrapper implements
    */
   private void abort(String why, Throwable e) {
     LOG.error("<" + instanceName + "> is going to abort " +
-		"because " + why);
+      "because " + why);
     this.abortable.abort(why, e);
   }