You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ra...@apache.org on 2012/05/28 19:14:56 UTC
svn commit: r1343324 - in /hbase/trunk/hbase-server/src:
main/java/org/apache/hadoop/hbase/master/
test/java/org/apache/hadoop/hbase/regionserver/
Author: ramkrishna
Date: Mon May 28 17:14:56 2012
New Revision: 1343324
URL: http://svn.apache.org/viewvc?rev=1343324&view=rev
Log:
HBASE-5916 RS restart just before master initialization we make the cluster non operative (Rajesh)
Modified:
hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java
Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1343324&r1=1343323&r2=1343324&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Mon May 28 17:14:56 2012
@@ -343,7 +343,7 @@ public class AssignmentManager extends Z
* @throws KeeperException
* @throws InterruptedException
*/
- void joinCluster(final Set<ServerName> onlineServers) throws IOException,
+ void joinCluster() throws IOException,
KeeperException, InterruptedException {
// Concurrency note: In the below the accesses on regionsInTransition are
// outside of a synchronization block where usually all accesses to RIT are
@@ -355,7 +355,7 @@ public class AssignmentManager extends Z
// Scan META to build list of existing regions, servers, and assignment
// Returns servers who have not checked in (assumed dead) and their regions
- Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers = rebuildUserRegions(onlineServers);
+ Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers = rebuildUserRegions();
// This method will assign all user regions if a clean server startup or
// it will reconstitute master state and cleanup any leftovers from
@@ -369,16 +369,6 @@ public class AssignmentManager extends Z
}
/**
- * Only used for tests
- * @throws IOException
- * @throws KeeperException
- * @throws InterruptedException
- */
- void joinCluster() throws IOException, KeeperException, InterruptedException {
- joinCluster(serverManager.getOnlineServers().keySet());
- }
-
- /**
* Process all regions that are in transition up in zookeeper. Used by
* master joining an already running cluster.
* @throws KeeperException
@@ -2509,11 +2499,12 @@ public class AssignmentManager extends Z
* in META
* @throws IOException
*/
- Map<ServerName, List<Pair<HRegionInfo, Result>>> rebuildUserRegions(
- final Set<ServerName> onlineServers)
+ Map<ServerName, List<Pair<HRegionInfo, Result>>> rebuildUserRegions()
throws IOException, KeeperException {
// Region assignment from META
List<Result> results = MetaReader.fullScan(this.catalogTracker);
+ // Get any new but slow to check in region servers that joined the cluster
+ Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
// Map of offline servers and their regions to be returned
Map<ServerName, List<Pair<HRegionInfo,Result>>> offlineServers =
new TreeMap<ServerName, List<Pair<HRegionInfo, Result>>>();
@@ -2722,7 +2713,13 @@ public class AssignmentManager extends Z
final List<String> nodes)
throws KeeperException, IOException {
if (deadServers == null) return;
+ Set<ServerName> actualDeadServers = this.serverManager.getDeadServers();
for (Map.Entry<ServerName, List<Pair<HRegionInfo, Result>>> deadServer: deadServers.entrySet()) {
+ // Skip regions of dead servers because SSH (ServerShutdownHandler) will process them
+ // during rs expiration. See HBASE-5916.
+ if (actualDeadServers.contains(deadServer.getKey())) {
+ continue;
+ }
List<Pair<HRegionInfo, Result>> regions = deadServer.getValue();
for (Pair<HRegionInfo, Result> region : regions) {
HRegionInfo regionInfo = region.getFirst();
Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1343324&r1=1343323&r2=1343324&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Mon May 28 17:14:56 2012
@@ -597,11 +597,9 @@ Server {
}
this.assignmentManager.startTimeOutMonitor();
- Set<ServerName> onlineServers = new HashSet<ServerName>(serverManager
- .getOnlineServers().keySet());
// TODO: Should do this in background rather than block master startup
status.setStatus("Splitting logs after master startup");
- splitLogAfterStartup(this.fileSystemManager, onlineServers);
+ splitLogAfterStartup(this.fileSystemManager);
// Make sure root and meta assigned before proceeding.
if (!assignRootAndMeta(status)) return;
@@ -618,7 +616,7 @@ Server {
this.balancer.setMasterServices(this);
// Fixup assignment manager status
status.setStatus("Starting assignment manager");
- this.assignmentManager.joinCluster(onlineServers);
+ this.assignmentManager.joinCluster();
this.balancer.setClusterStatus(getClusterStatus());
@@ -638,7 +636,11 @@ Server {
status.markComplete("Initialization successful");
LOG.info("Master has completed initialization");
initialized = true;
-
+ // Clear dead servers that share a host name and port with an online server: such entries
+ // are not removed when a restarted rs checks in before master initialization completes.
+ // See HBASE-5916.
+ this.serverManager.clearDeadServersWithSameHostNameAndPortOfOnlineServer();
+
if (this.cpHost != null) {
// don't let cp initialization errors kill the master
try {
@@ -662,9 +664,8 @@ Server {
* @param mfs
* @param onlineServers
*/
- protected void splitLogAfterStartup(final MasterFileSystem mfs,
- Set<ServerName> onlineServers) {
- mfs.splitLogAfterStartup(onlineServers);
+ protected void splitLogAfterStartup(final MasterFileSystem mfs) {
+ mfs.splitLogAfterStartup();
}
/**
Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java?rev=1343324&r1=1343323&r2=1343324&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java (original)
+++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java Mon May 28 17:14:56 2012
@@ -190,7 +190,7 @@ public class MasterFileSystem {
* @param onlineServers Set of online servers keyed by
* {@link ServerName}
*/
- void splitLogAfterStartup(final Set<ServerName> onlineServers) {
+ void splitLogAfterStartup() {
boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
HLog.SPLIT_SKIP_ERRORS_DEFAULT);
Path logsDirPath = new Path(this.rootdir, HConstants.HREGION_LOGDIR_NAME);
@@ -199,6 +199,10 @@ public class MasterFileSystem {
try {
if (!this.fs.exists(logsDirPath)) return;
FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null);
+ // Get online servers after getting log folders to avoid log folder deletion of newly
+ // checked in region servers. See HBASE-5916.
+ Set<ServerName> onlineServers = ((HMaster) master).getServerManager().getOnlineServers()
+ .keySet();
if (logFolders == null || logFolders.length == 0) {
LOG.debug("No log files to split, proceeding...");
Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1343324&r1=1343323&r2=1343324&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Mon May 28 17:14:56 2012
@@ -200,7 +200,10 @@ public class ServerManager {
existingServer + " looks stale, new server:" + serverName);
expireServer(existingServer);
}
- throw new PleaseHoldException(message);
+ if (services.isServerShutdownHandlerEnabled()) {
+ // master has completed the initialization
+ throw new PleaseHoldException(message);
+ }
}
}
@@ -247,8 +250,10 @@ public class ServerManager {
LOG.debug(message);
throw new YouAreDeadException(message);
}
-
- if (this.deadservers.cleanPreviousInstance(serverName)) {
+ // remove dead server with same hostname and port of newly checking in rs after master
+ // initialization. See HBASE-5916 for more information.
+ if ((this.services == null || ((HMaster) this.services).isInitialized())
+ && this.deadservers.cleanPreviousInstance(serverName)) {
// This server has now become alive after we marked it as dead.
// We removed it's previous entry from the dead list to reflect it.
LOG.debug(what + ":" + " Server " + serverName + " came back up," +
@@ -737,5 +742,18 @@ public class ServerManager {
}
}
}
+
+ /**
+ * Clears any dead server that has the same host name and port as an online server.
+ */
+ void clearDeadServersWithSameHostNameAndPortOfOnlineServer() {
+ ServerName sn = null;
+ for (ServerName serverName : getOnlineServersList()) {
+ while ((sn = ServerName.
+ findServerWithSameHostnamePort(this.deadservers, serverName)) != null) {
+ this.deadservers.remove(sn);
+ }
+ }
+ }
}
Modified: hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java?rev=1343324&r1=1343323&r2=1343324&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java (original)
+++ hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java Mon May 28 17:14:56 2012
@@ -49,6 +49,8 @@ import org.apache.hadoop.hbase.master.Te
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.util.Threads;
+import org.apache.hadoop.hbase.zookeeper.ZKAssign;
+import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.hbase.LargeTests;
import org.apache.zookeeper.KeeperException;
import org.junit.AfterClass;
@@ -98,9 +100,8 @@ public class TestRSKilledWhenMasterIniti
}
@Override
- protected void splitLogAfterStartup(MasterFileSystem mfs,
- Set<ServerName> onlineServers) {
- super.splitLogAfterStartup(mfs, onlineServers);
+ protected void splitLogAfterStartup(MasterFileSystem mfs) {
+ super.splitLogAfterStartup(mfs);
logSplit = true;
// If "TestingMaster.sleep" is set, sleep after log split.
if (getConfiguration().getBoolean("TestingMaster.sleep", false)) {
@@ -215,6 +216,10 @@ public class TestRSKilledWhenMasterIniti
while (serverManager.areDeadServersInProgress()) {
Thread.sleep(100);
}
+ // Create a ZKW to use in the test
+ ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(TESTUTIL);
+ ZKAssign.blockUntilNoRIT(zkw);
+
table = new HTable(TESTUTIL.getConfiguration(), TABLENAME);
resultScanner = table.getScanner(new Scan());
count = 0;