You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ra...@apache.org on 2012/05/28 19:14:56 UTC

svn commit: r1343324 - in /hbase/trunk/hbase-server/src: main/java/org/apache/hadoop/hbase/master/ test/java/org/apache/hadoop/hbase/regionserver/

Author: ramkrishna
Date: Mon May 28 17:14:56 2012
New Revision: 1343324

URL: http://svn.apache.org/viewvc?rev=1343324&view=rev
Log:
HBASE-5916 RS restart just before master initialization we make the cluster non operative (Rajesh)

Modified:
    hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
    hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
    hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
    hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java

Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1343324&r1=1343323&r2=1343324&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Mon May 28 17:14:56 2012
@@ -343,7 +343,7 @@ public class AssignmentManager extends Z
    * @throws KeeperException
    * @throws InterruptedException
    */
-  void joinCluster(final Set<ServerName> onlineServers) throws IOException,
+  void joinCluster() throws IOException,
       KeeperException, InterruptedException {
     // Concurrency note: In the below the accesses on regionsInTransition are
     // outside of a synchronization block where usually all accesses to RIT are
@@ -355,7 +355,7 @@ public class AssignmentManager extends Z
 
     // Scan META to build list of existing regions, servers, and assignment
     // Returns servers who have not checked in (assumed dead) and their regions
-    Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers = rebuildUserRegions(onlineServers);
+    Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers = rebuildUserRegions();
 
     // This method will assign all user regions if a clean server startup or
     // it will reconstitute master state and cleanup any leftovers from
@@ -369,16 +369,6 @@ public class AssignmentManager extends Z
   }
 
   /**
-   * Only used for tests
-   * @throws IOException
-   * @throws KeeperException
-   * @throws InterruptedException
-   */
-  void joinCluster() throws IOException, KeeperException, InterruptedException {
-    joinCluster(serverManager.getOnlineServers().keySet());
-  }
-
-  /**
    * Process all regions that are in transition up in zookeeper.  Used by
    * master joining an already running cluster.
    * @throws KeeperException
@@ -2509,11 +2499,12 @@ public class AssignmentManager extends Z
    *         in META
    * @throws IOException
    */
-  Map<ServerName, List<Pair<HRegionInfo, Result>>> rebuildUserRegions(
-      final Set<ServerName> onlineServers)
+  Map<ServerName, List<Pair<HRegionInfo, Result>>> rebuildUserRegions()
   throws IOException, KeeperException {
     // Region assignment from META
     List<Result> results = MetaReader.fullScan(this.catalogTracker);
+    // Get any new but slow to checkin region server that joined the cluster
+    Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
     // Map of offline servers and their regions to be returned
     Map<ServerName, List<Pair<HRegionInfo,Result>>> offlineServers =
       new TreeMap<ServerName, List<Pair<HRegionInfo, Result>>>();
@@ -2722,7 +2713,13 @@ public class AssignmentManager extends Z
       final List<String> nodes)
   throws KeeperException, IOException {
     if (deadServers == null) return;
+    Set<ServerName> actualDeadServers = this.serverManager.getDeadServers();
     for (Map.Entry<ServerName, List<Pair<HRegionInfo, Result>>> deadServer: deadServers.entrySet()) {
+      // skip regions of dead servers because SSH will process regions during rs expiration. 
+      // see HBASE-5916
+      if (actualDeadServers.contains(deadServer.getKey())) {
+        continue;
+      }
       List<Pair<HRegionInfo, Result>> regions = deadServer.getValue();
       for (Pair<HRegionInfo, Result> region : regions) {
         HRegionInfo regionInfo = region.getFirst();

Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1343324&r1=1343323&r2=1343324&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Mon May 28 17:14:56 2012
@@ -597,11 +597,9 @@ Server {
     }
 
     this.assignmentManager.startTimeOutMonitor();
-    Set<ServerName> onlineServers = new HashSet<ServerName>(serverManager
-        .getOnlineServers().keySet());
     // TODO: Should do this in background rather than block master startup
     status.setStatus("Splitting logs after master startup");
-    splitLogAfterStartup(this.fileSystemManager, onlineServers);
+    splitLogAfterStartup(this.fileSystemManager);
 
     // Make sure root and meta assigned before proceeding.
     if (!assignRootAndMeta(status)) return;
@@ -618,7 +616,7 @@ Server {
     this.balancer.setMasterServices(this);
     // Fixup assignment manager status
     status.setStatus("Starting assignment manager");
-    this.assignmentManager.joinCluster(onlineServers);
+    this.assignmentManager.joinCluster();
 
     this.balancer.setClusterStatus(getClusterStatus());
 
@@ -638,7 +636,11 @@ Server {
     status.markComplete("Initialization successful");
     LOG.info("Master has completed initialization");
     initialized = true;
-
+    // clear the dead servers with same host name and port of online server because we are not
+    // removing dead server with same hostname and port of rs which is trying to check in before
+    // master initialization. See HBASE-5916.
+    this.serverManager.clearDeadServersWithSameHostNameAndPortOfOnlineServer();
+    
     if (this.cpHost != null) {
       // don't let cp initialization errors kill the master
       try {
@@ -662,9 +664,8 @@ Server {
    * @param mfs
    * @param onlineServers
    */
-  protected void splitLogAfterStartup(final MasterFileSystem mfs,
-      Set<ServerName> onlineServers) {
-    mfs.splitLogAfterStartup(onlineServers);
+  protected void splitLogAfterStartup(final MasterFileSystem mfs) {
+    mfs.splitLogAfterStartup();
   }
 
   /**

Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java?rev=1343324&r1=1343323&r2=1343324&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java (original)
+++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java Mon May 28 17:14:56 2012
@@ -190,7 +190,7 @@ public class MasterFileSystem {
    * @param onlineServers Set of online servers keyed by
    * {@link ServerName}
    */
-  void splitLogAfterStartup(final Set<ServerName> onlineServers) {
+  void splitLogAfterStartup() {
     boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
         HLog.SPLIT_SKIP_ERRORS_DEFAULT);
     Path logsDirPath = new Path(this.rootdir, HConstants.HREGION_LOGDIR_NAME);
@@ -199,6 +199,10 @@ public class MasterFileSystem {
       try {
         if (!this.fs.exists(logsDirPath)) return;
         FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null);
+        // Get online servers after getting log folders to avoid log folder deletion of newly
+        // checked in region servers. See HBASE-5916.
+        Set<ServerName> onlineServers = ((HMaster) master).getServerManager().getOnlineServers()
+            .keySet();
 
         if (logFolders == null || logFolders.length == 0) {
           LOG.debug("No log files to split, proceeding...");

Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1343324&r1=1343323&r2=1343324&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Mon May 28 17:14:56 2012
@@ -200,7 +200,10 @@ public class ServerManager {
           existingServer + " looks stale, new server:" + serverName);
         expireServer(existingServer);
       }
-      throw new PleaseHoldException(message);
+      if (services.isServerShutdownHandlerEnabled()) {
+        // master has completed the initialization
+        throw new PleaseHoldException(message);
+      }
     }
   }
 
@@ -247,8 +250,10 @@ public class ServerManager {
       LOG.debug(message);
       throw new YouAreDeadException(message);
     }
-
-    if (this.deadservers.cleanPreviousInstance(serverName)) {
+    // remove dead server with same hostname and port of newly checking in rs after master
+    // initialization. See HBASE-5916 for more information.
+    if ((this.services == null || ((HMaster) this.services).isInitialized())
+        && this.deadservers.cleanPreviousInstance(serverName)) {
       // This server has now become alive after we marked it as dead.
       // We removed it's previous entry from the dead list to reflect it.
       LOG.debug(what + ":" + " Server " + serverName + " came back up," +
@@ -737,5 +742,18 @@ public class ServerManager {
       }
     }
   }
+  
+  /**
+   * To clear any dead server with same host name and port of any online server
+   */
+  void clearDeadServersWithSameHostNameAndPortOfOnlineServer() {
+    ServerName sn = null;
+    for (ServerName serverName : getOnlineServersList()) {
+      while ((sn = ServerName.
+          findServerWithSameHostnamePort(this.deadservers, serverName)) != null) {
+        this.deadservers.remove(sn);
+      }
+    }
+  }
 
 }

Modified: hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java?rev=1343324&r1=1343323&r2=1343324&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java (original)
+++ hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java Mon May 28 17:14:56 2012
@@ -49,6 +49,8 @@ import org.apache.hadoop.hbase.master.Te
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
 import org.apache.hadoop.hbase.util.Threads;
+import org.apache.hadoop.hbase.zookeeper.ZKAssign;
+import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
 import org.apache.hadoop.hbase.LargeTests;
 import org.apache.zookeeper.KeeperException;
 import org.junit.AfterClass;
@@ -98,9 +100,8 @@ public class TestRSKilledWhenMasterIniti
     }
 
     @Override
-    protected void splitLogAfterStartup(MasterFileSystem mfs,
-        Set<ServerName> onlineServers) {
-      super.splitLogAfterStartup(mfs, onlineServers);
+    protected void splitLogAfterStartup(MasterFileSystem mfs) {
+      super.splitLogAfterStartup(mfs);
       logSplit = true;
       // If "TestingMaster.sleep" is set, sleep after log split.
       if (getConfiguration().getBoolean("TestingMaster.sleep", false)) {
@@ -215,6 +216,10 @@ public class TestRSKilledWhenMasterIniti
     while (serverManager.areDeadServersInProgress()) {
       Thread.sleep(100);
     }
+    // Create a ZKW to use in the test
+    ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(TESTUTIL);
+    ZKAssign.blockUntilNoRIT(zkw);
+    
     table = new HTable(TESTUTIL.getConfiguration(), TABLENAME);
     resultScanner = table.getScanner(new Scan());
     count = 0;