You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by je...@apache.org on 2013/06/04 23:05:48 UTC
svn commit: r1489609 - in /hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master: HMaster.java MasterFileSystem.java SplitLogManager.java

Author: jeffreyz
Date: Tue Jun  4 21:05:48 2013
New Revision: 1489609

URL: http://svn.apache.org/r1489609
Log:
hbase-8666: META region isn't fully recovered during master initialization when META region recovery had chained failures

Modified:
    hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
    hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java

Modified: hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1489609&r1=1489608&r2=1489609&view=diff
==============================================================================
--- hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Tue Jun  4 21:05:48 2013
@@ -795,6 +795,7 @@ MasterServices, Server {
       // Note: we can't remove oldMetaServerLocation from previousFailedServers list because it
       // may also host user regions
     }
+    Set<ServerName> previouslyFailedMetaRSs = getPreviouselyFailedMetaServersFromZK();
 
     this.initializationBeforeMetaAssignment = true;
     // Make sure meta assigned before proceeding.
@@ -804,11 +805,19 @@ MasterServices, Server {
     // assigned when master is shutting down
     if(this.stopped) return;
 
-    if (this.distributedLogReplay && oldMetaServerLocation != null
-        && previouslyFailedServers.contains(oldMetaServerLocation)) {
+    if (this.distributedLogReplay && (!previouslyFailedMetaRSs.isEmpty())) {
       // replay WAL edits mode need new .META. RS is assigned firstly
       status.setStatus("replaying log for Meta Region");
-      this.fileSystemManager.splitMetaLog(oldMetaServerLocation);
+      // Use the union of previouslyFailedMetaRSs (recorded in ZK) and previouslyFailedServers
+      // instead of oldMetaServerLocation to address the following two situations:
+      // 1) Chained failures (recovery failed multiple times in a row).
+      // 2) The master gets killed right before it can delete the recovering META from ZK while
+      // the same server still has non-meta WALs to be replayed, so that
+      // removeStaleRecoveringRegionsFromZK can't delete the stale META region.
+      // Passing more servers into splitMetaLog is all right: if a server doesn't have a .META.
+      // WAL, it is a no-op for that server.
+      previouslyFailedMetaRSs.addAll(previouslyFailedServers);
+      this.fileSystemManager.splitMetaLog(previouslyFailedMetaRSs);
     }
 
     enableServerShutdownHandler();
@@ -992,6 +1001,25 @@ MasterServices, Server {
     return true;
   }
 
+  /**
+   * Returns the set of region server names listed under the .META. recovering region ZK node.
+   * @return Set of meta server names which were recorded in ZK (possibly empty, never null)
+   * @throws KeeperException
+   */
+  private Set<ServerName> getPreviouselyFailedMetaServersFromZK() throws KeeperException {
+    Set<ServerName> result = new HashSet<ServerName>();
+    String metaRecoveringZNode = ZKUtil.joinZNode(zooKeeper.recoveringRegionsZNode,
+      HRegionInfo.FIRST_META_REGIONINFO.getEncodedName());
+    List<String> regionFailedServers = ZKUtil.listChildrenNoWatch(zooKeeper, metaRecoveringZNode);
+    if (regionFailedServers == null) return result;
+
+    for(String failedServer : regionFailedServers) {
+      ServerName server = ServerName.parseServerName(failedServer);
+      result.add(server);
+    }
+    return result;
+  }
+
   @Override
   public TableDescriptors getTableDescriptors() {
     return this.tableDescriptors;

Modified: hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java?rev=1489609&r1=1489608&r2=1489609&view=diff
==============================================================================
--- hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java (original)
+++ hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java Tue Jun  4 21:05:48 2013
@@ -294,9 +294,18 @@ public class MasterFileSystem {
    * @throws IOException
    */
   public void splitMetaLog(final ServerName serverName) throws IOException {
-    long splitTime = 0, splitLogSize = 0;
     Set<ServerName> serverNames = new HashSet<ServerName>();
     serverNames.add(serverName);
+    splitMetaLog(serverNames);
+  }
+
+  /**
+   * Specialized method to handle splitting of the meta HLog for a set of servers
+   * @param serverNames the servers to split the meta HLog for
+   * @throws IOException
+   */
+  public void splitMetaLog(final Set<ServerName> serverNames) throws IOException {
+    long splitTime = 0, splitLogSize = 0;
     List<Path> logDirs = getLogDirs(serverNames);
 
     splitLogManager.handleDeadWorkers(serverNames);

Modified: hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java?rev=1489609&r1=1489608&r2=1489609&view=diff
==============================================================================
--- hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java (original)
+++ hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java Tue Jun  4 21:05:48 2013
@@ -124,7 +124,7 @@ public class SplitLogManager extends Zoo
   private long unassignedTimeout;
   private long lastNodeCreateTime = Long.MAX_VALUE;
   public boolean ignoreZKDeleteForTesting = false;
-  private volatile long lastRecoveringNodeCreationTime = Long.MAX_VALUE;
+  private volatile long lastRecoveringNodeCreationTime = 0;
   // When lastRecoveringNodeCreationTime is older than the following threshold, we'll check
   // whether to GC stale recovering znodes
   private long checkRecoveringTimeThreshold = 15000; // 15 seconds