You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ra...@apache.org on 2012/05/29 18:33:41 UTC
svn commit: r1343824 - in /hbase/branches/0.92/src:
main/java/org/apache/hadoop/hbase/master/
test/java/org/apache/hadoop/hbase/regionserver/
Author: ramkrishna
Date: Tue May 29 16:33:41 2012
New Revision: 1343824
URL: http://svn.apache.org/viewvc?rev=1343824&view=rev
Log:
HBASE-5916 RS restart just before master initialization we make the cluster non operative (Rajesh)
Modified:
hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java
Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1343824&r1=1343823&r2=1343824&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Tue May 29 16:33:41 2012
@@ -325,12 +325,11 @@ public class AssignmentManager extends Z
/**
* Called on startup.
* Figures whether a fresh cluster start of we are joining extant running cluster.
- * @param onlineServers onlined servers when master started
* @throws IOException
* @throws KeeperException
* @throws InterruptedException
*/
- void joinCluster(final Set<ServerName> onlineServers) throws IOException,
+ void joinCluster() throws IOException,
KeeperException, InterruptedException {
// Concurrency note: In the below the accesses on regionsInTransition are
// outside of a synchronization block where usually all accesses to RIT are
@@ -342,7 +341,7 @@ public class AssignmentManager extends Z
// Scan META to build list of existing regions, servers, and assignment
// Returns servers who have not checked in (assumed dead) and their regions
- Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers = rebuildUserRegions(onlineServers);
+ Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers = rebuildUserRegions();
processDeadServersAndRegionsInTransition(deadServers);
@@ -353,16 +352,6 @@ public class AssignmentManager extends Z
}
/**
- * Only used for tests
- * @throws IOException
- * @throws KeeperException
- * @throws InterruptedException
- */
- void joinCluster() throws IOException, KeeperException, InterruptedException {
- joinCluster(serverManager.getOnlineServers().keySet());
- }
-
- /**
* Process all regions that are in transition up in zookeeper. Used by
* master joining an already running cluster.
* @throws KeeperException
@@ -2354,17 +2343,16 @@ public class AssignmentManager extends Z
* <p>
* Returns a map of servers that are not found to be online and the regions
* they were hosting.
- * @param onlineServers if one region's location belongs to onlineServers, it
- * doesn't need to be assigned.
* @return map of servers not online to their assigned regions, as stored
* in META
* @throws IOException
*/
- Map<ServerName, List<Pair<HRegionInfo, Result>>> rebuildUserRegions(
- final Set<ServerName> onlineServers)
- throws IOException, KeeperException {
+ Map<ServerName, List<Pair<HRegionInfo, Result>>> rebuildUserRegions() throws IOException,
+ KeeperException {
// Region assignment from META
List<Result> results = MetaReader.fullScan(this.catalogTracker);
+ // Get any new but slow to checkin region server that joined the cluster
+ Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
// Map of offline servers and their regions to be returned
Map<ServerName, List<Pair<HRegionInfo,Result>>> offlineServers =
new TreeMap<ServerName, List<Pair<HRegionInfo, Result>>>();
@@ -2538,8 +2526,15 @@ public class AssignmentManager extends Z
Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers,
List<String> nodes) throws IOException, KeeperException {
if (null != deadServers) {
+ Set<ServerName> actualDeadServers = this.serverManager.getDeadServers();
for (Map.Entry<ServerName, List<Pair<HRegionInfo, Result>>> deadServer :
deadServers.entrySet()) {
+ // skip regions of dead servers because SSH will process regions during rs expiration.
+ // see HBASE-5916
+ if (actualDeadServers.contains(deadServer.getKey())) {
+ continue;
+ }
+
List<Pair<HRegionInfo, Result>> regions = deadServer.getValue();
for (Pair<HRegionInfo, Result> region : regions) {
HRegionInfo regionInfo = region.getFirst();
Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1343824&r1=1343823&r2=1343824&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Tue May 29 16:33:41 2012
@@ -515,11 +515,9 @@ Server {
}
this.assignmentManager.startTimeOutMonitor();
- Set<ServerName> onlineServers = new HashSet<ServerName>(serverManager
- .getOnlineServers().keySet());
// TODO: Should do this in background rather than block master startup
status.setStatus("Splitting logs after master startup");
- splitLogAfterStartup(this.fileSystemManager, onlineServers);
+ splitLogAfterStartup(this.fileSystemManager);
// Make sure root and meta assigned before proceeding.
assignRootAndMeta(status);
@@ -535,7 +533,7 @@ Server {
// Fixup assignment manager status
status.setStatus("Starting assignment manager");
- this.assignmentManager.joinCluster(onlineServers);
+ this.assignmentManager.joinCluster();
this.balancer.setClusterStatus(getClusterStatus());
this.balancer.setMasterServices(this);
@@ -557,6 +555,11 @@ Server {
LOG.info("Master has completed initialization");
initialized = true;
+ // clear the dead servers with same host name and port of online server because we are not
+ // removing dead server with same hostname and port of rs which is trying to check in before
+ // master initialization. See HBASE-5916.
+ this.serverManager.clearDeadServersWithSameHostNameAndPortOfOnlineServer();
+
if (this.cpHost != null) {
// don't let cp initialization errors kill the master
try {
@@ -577,11 +580,9 @@ Server {
/**
* Override to change master's splitLogAfterStartup. Used testing
* @param mfs
- * @param onlineServers
*/
- protected void splitLogAfterStartup(final MasterFileSystem mfs,
- Set<ServerName> onlineServers) {
- mfs.splitLogAfterStartup(onlineServers);
+ protected void splitLogAfterStartup(final MasterFileSystem mfs) {
+ mfs.splitLogAfterStartup();
}
/**
Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java?rev=1343824&r1=1343823&r2=1343824&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java Tue May 29 16:33:41 2012
@@ -183,10 +183,8 @@ public class MasterFileSystem {
/**
* Inspect the log directory to recover any log file without
* an active region server.
- * @param onlineServers Set of online servers keyed by
- * {@link ServerName}
*/
- void splitLogAfterStartup(final Set<ServerName> onlineServers) {
+ void splitLogAfterStartup() {
boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
HLog.SPLIT_SKIP_ERRORS_DEFAULT);
Path logsDirPath = new Path(this.rootdir, HConstants.HREGION_LOGDIR_NAME);
@@ -195,6 +193,10 @@ public class MasterFileSystem {
try {
if (!this.fs.exists(logsDirPath)) return;
FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null);
+ // Get online servers after getting log folders to avoid log folder deletion of newly
+ // checked in region servers . see HBASE-5916
+ Set<ServerName> onlineServers = ((HMaster) master).getServerManager().getOnlineServers()
+ .keySet();
if (logFolders == null || logFolders.length == 0) {
LOG.debug("No log files to split, proceeding...");
Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1343824&r1=1343823&r2=1343824&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Tue May 29 16:33:41 2012
@@ -189,7 +189,10 @@ public class ServerManager {
existingServer + " looks stale, new server:" + serverName);
expireServer(existingServer);
}
- throw new PleaseHoldException(message);
+ if (services.isServerShutdownHandlerEnabled()) {
+ // master has completed the initialization
+ throw new PleaseHoldException(message);
+ }
}
}
@@ -232,7 +235,10 @@ public class ServerManager {
throw new YouAreDeadException(message);
}
- if (this.deadservers.cleanPreviousInstance(serverName)) {
+ // remove dead server with same hostname and port of newly checking in rs after master
+ // initialization.See HBASE-5916 for more information.
+ if ((this.services == null || ((HMaster) this.services).isInitialized())
+ && this.deadservers.cleanPreviousInstance(serverName)) {
// This server has now become alive after we marked it as dead.
// We removed it's previous entry from the dead list to reflect it.
LOG.debug(what + ":" + " Server " + serverName + " came back up," +
@@ -624,4 +630,18 @@ public class ServerManager {
}
}
}
+
+ /**
+ * To clear any dead server with same host name and port of any online server
+ */
+ void clearDeadServersWithSameHostNameAndPortOfOnlineServer() {
+ ServerName sn = null;
+ for (ServerName serverName : getOnlineServersList()) {
+ while ((sn = ServerName.
+ findServerWithSameHostnamePort(this.deadservers, serverName)) != null) {
+ this.deadservers.remove(sn);
+ }
+ }
+ }
+
}
Modified: hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java?rev=1343824&r1=1343823&r2=1343824&view=diff
==============================================================================
--- hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java (original)
+++ hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java Tue May 29 16:33:41 2012
@@ -49,6 +49,8 @@ import org.apache.hadoop.hbase.master.Te
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.util.Threads;
+import org.apache.hadoop.hbase.zookeeper.ZKAssign;
+import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.zookeeper.KeeperException;
import org.junit.AfterClass;
import org.junit.BeforeClass;
@@ -95,9 +97,8 @@ public class TestRSKilledWhenMasterIniti
}
@Override
- protected void splitLogAfterStartup(MasterFileSystem mfs,
- Set<ServerName> onlineServers) {
- super.splitLogAfterStartup(mfs, onlineServers);
+ protected void splitLogAfterStartup(MasterFileSystem mfs) {
+ super.splitLogAfterStartup(mfs);
logSplit = true;
// If "TestingMaster.sleep" is set, sleep after log split.
if (getConfiguration().getBoolean("TestingMaster.sleep", false)) {
@@ -212,6 +213,10 @@ public class TestRSKilledWhenMasterIniti
while (serverManager.areDeadServersInProgress()) {
Thread.sleep(100);
}
+ // Create a ZKW to use in the test
+ ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(TESTUTIL);
+ ZKAssign.blockUntilNoRIT(zkw);
+
table = new HTable(TESTUTIL.getConfiguration(), TABLENAME);
resultScanner = table.getScanner(new Scan());
count = 0;