You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by je...@apache.org on 2013/06/04 23:05:48 UTC
svn commit: r1489609 - in /hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master: HMaster.java MasterFileSystem.java SplitLogManager.java
Author: jeffreyz
Date: Tue Jun 4 21:05:48 2013
New Revision: 1489609
URL: http://svn.apache.org/r1489609
Log:
hbase-8666: META region isn't fully recovered during master initialization when META region recovery had chained failures
Modified:
hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
Modified: hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1489609&r1=1489608&r2=1489609&view=diff
==============================================================================
--- hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Tue Jun 4 21:05:48 2013
@@ -795,6 +795,7 @@ MasterServices, Server {
// Note: we can't remove oldMetaServerLocation from previousFailedServers list because it
// may also host user regions
}
+ Set<ServerName> previouslyFailedMetaRSs = getPreviouselyFailedMetaServersFromZK();
this.initializationBeforeMetaAssignment = true;
// Make sure meta assigned before proceeding.
@@ -804,11 +805,19 @@ MasterServices, Server {
// assigned when master is shutting down
if(this.stopped) return;
- if (this.distributedLogReplay && oldMetaServerLocation != null
- && previouslyFailedServers.contains(oldMetaServerLocation)) {
+ if (this.distributedLogReplay && (!previouslyFailedMetaRSs.isEmpty())) {
// replay WAL edits mode need new .META. RS is assigned firstly
status.setStatus("replaying log for Meta Region");
- this.fileSystemManager.splitMetaLog(oldMetaServerLocation);
+ // need to use union of previouslyFailedMetaRSs recorded in ZK and previouslyFailedServers
+ // instead of oldMetaServerLocation to address the following two situations:
+ // 1) the chained failure situation(recovery failed multiple times in a row).
+ // 2) master get killed right before it could delete the recovering META from ZK while the
+ // same server still has non-meta wals to be replayed so that
+ // removeStaleRecoveringRegionsFromZK can't delete the stale META region
+ // Passing more servers into splitMetaLog is all right. If a server doesn't have .META. wal,
+ // there is no op for the server.
+ previouslyFailedMetaRSs.addAll(previouslyFailedServers);
+ this.fileSystemManager.splitMetaLog(previouslyFailedMetaRSs);
}
enableServerShutdownHandler();
@@ -992,6 +1001,25 @@ MasterServices, Server {
return true;
}
+ /**
+ * This function returns a set of region server names under .META. recovering region ZK node
+ * @return Set of meta server names which were recorded in ZK
+ * @throws KeeperException
+ */
+ private Set<ServerName> getPreviouselyFailedMetaServersFromZK() throws KeeperException {
+ Set<ServerName> result = new HashSet<ServerName>();
+ String metaRecoveringZNode = ZKUtil.joinZNode(zooKeeper.recoveringRegionsZNode,
+ HRegionInfo.FIRST_META_REGIONINFO.getEncodedName());
+ List<String> regionFailedServers = ZKUtil.listChildrenNoWatch(zooKeeper, metaRecoveringZNode);
+ if (regionFailedServers == null) return result;
+
+ for(String failedServer : regionFailedServers) {
+ ServerName server = ServerName.parseServerName(failedServer);
+ result.add(server);
+ }
+ return result;
+ }
+
@Override
public TableDescriptors getTableDescriptors() {
return this.tableDescriptors;
Modified: hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java?rev=1489609&r1=1489608&r2=1489609&view=diff
==============================================================================
--- hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java (original)
+++ hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java Tue Jun 4 21:05:48 2013
@@ -294,9 +294,18 @@ public class MasterFileSystem {
* @throws IOException
*/
public void splitMetaLog(final ServerName serverName) throws IOException {
- long splitTime = 0, splitLogSize = 0;
Set<ServerName> serverNames = new HashSet<ServerName>();
serverNames.add(serverName);
+ splitMetaLog(serverNames);
+ }
+
+ /**
+ * Specialized method to handle the splitting for meta HLog
+ * @param serverNames
+ * @throws IOException
+ */
+ public void splitMetaLog(final Set<ServerName> serverNames) throws IOException {
+ long splitTime = 0, splitLogSize = 0;
List<Path> logDirs = getLogDirs(serverNames);
splitLogManager.handleDeadWorkers(serverNames);
Modified: hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java?rev=1489609&r1=1489608&r2=1489609&view=diff
==============================================================================
--- hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java (original)
+++ hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java Tue Jun 4 21:05:48 2013
@@ -124,7 +124,7 @@ public class SplitLogManager extends Zoo
private long unassignedTimeout;
private long lastNodeCreateTime = Long.MAX_VALUE;
public boolean ignoreZKDeleteForTesting = false;
- private volatile long lastRecoveringNodeCreationTime = Long.MAX_VALUE;
+ private volatile long lastRecoveringNodeCreationTime = 0;
// When lastRecoveringNodeCreationTime is older than the following threshold, we'll check
// whether to GC stale recovering znodes
private long checkRecoveringTimeThreshold = 15000; // 15 seconds