You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ra...@apache.org on 2012/05/29 18:33:41 UTC

svn commit: r1343824 - in /hbase/branches/0.92/src: main/java/org/apache/hadoop/hbase/master/ test/java/org/apache/hadoop/hbase/regionserver/

Author: ramkrishna
Date: Tue May 29 16:33:41 2012
New Revision: 1343824

URL: http://svn.apache.org/viewvc?rev=1343824&view=rev
Log:
HBASE-5916 RS restart just before master initialization we make the cluster non operative (Rajesh)

Modified:
    hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
    hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
    hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
    hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java

Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1343824&r1=1343823&r2=1343824&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Tue May 29 16:33:41 2012
@@ -325,12 +325,11 @@ public class AssignmentManager extends Z
   /**
    * Called on startup.
    * Figures whether a fresh cluster start of we are joining extant running cluster.
-   * @param onlineServers onlined servers when master started
    * @throws IOException
    * @throws KeeperException
    * @throws InterruptedException
    */
-  void joinCluster(final Set<ServerName> onlineServers) throws IOException,
+  void joinCluster() throws IOException,
       KeeperException, InterruptedException {
     // Concurrency note: In the below the accesses on regionsInTransition are
     // outside of a synchronization block where usually all accesses to RIT are
@@ -342,7 +341,7 @@ public class AssignmentManager extends Z
 
     // Scan META to build list of existing regions, servers, and assignment
     // Returns servers who have not checked in (assumed dead) and their regions
-    Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers = rebuildUserRegions(onlineServers);
+    Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers = rebuildUserRegions();
 
     processDeadServersAndRegionsInTransition(deadServers);
 
@@ -353,16 +352,6 @@ public class AssignmentManager extends Z
   }
 
   /**
-   * Only used for tests
-   * @throws IOException
-   * @throws KeeperException
-   * @throws InterruptedException
-   */
-  void joinCluster() throws IOException, KeeperException, InterruptedException {
-    joinCluster(serverManager.getOnlineServers().keySet());
-  }
-
-  /**
    * Process all regions that are in transition up in zookeeper.  Used by
    * master joining an already running cluster.
    * @throws KeeperException
@@ -2354,17 +2343,16 @@ public class AssignmentManager extends Z
    * <p>
    * Returns a map of servers that are not found to be online and the regions
    * they were hosting.
-   * @param onlineServers if one region's location belongs to onlineServers, it
-   *          doesn't need to be assigned.
    * @return map of servers not online to their assigned regions, as stored
    *         in META
    * @throws IOException
    */
-  Map<ServerName, List<Pair<HRegionInfo, Result>>> rebuildUserRegions(
-      final Set<ServerName> onlineServers)
-  throws IOException, KeeperException {
+  Map<ServerName, List<Pair<HRegionInfo, Result>>> rebuildUserRegions() throws IOException,
+      KeeperException {
     // Region assignment from META
     List<Result> results = MetaReader.fullScan(this.catalogTracker);
+    // Include any new but slow-to-check-in region servers that have joined the cluster
+    Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();    
     // Map of offline servers and their regions to be returned
     Map<ServerName, List<Pair<HRegionInfo,Result>>> offlineServers =
       new TreeMap<ServerName, List<Pair<HRegionInfo, Result>>>();
@@ -2538,8 +2526,15 @@ public class AssignmentManager extends Z
       Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers,
       List<String> nodes) throws IOException, KeeperException {
     if (null != deadServers) {
+      Set<ServerName> actualDeadServers = this.serverManager.getDeadServers();
       for (Map.Entry<ServerName, List<Pair<HRegionInfo, Result>>> deadServer : 
         deadServers.entrySet()) {
+        // Skip regions of actually dead servers: the ServerShutdownHandler (SSH) will process
+        // them during RS expiration. See HBASE-5916.
+        if (actualDeadServers.contains(deadServer.getKey())) {
+          continue;
+        }
+        
         List<Pair<HRegionInfo, Result>> regions = deadServer.getValue();
         for (Pair<HRegionInfo, Result> region : regions) {
           HRegionInfo regionInfo = region.getFirst();

Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1343824&r1=1343823&r2=1343824&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Tue May 29 16:33:41 2012
@@ -515,11 +515,9 @@ Server {
     }
 
     this.assignmentManager.startTimeOutMonitor();
-    Set<ServerName> onlineServers = new HashSet<ServerName>(serverManager
-        .getOnlineServers().keySet());
     // TODO: Should do this in background rather than block master startup
     status.setStatus("Splitting logs after master startup");
-    splitLogAfterStartup(this.fileSystemManager, onlineServers);
+    splitLogAfterStartup(this.fileSystemManager);
 
     // Make sure root and meta assigned before proceeding.
     assignRootAndMeta(status);
@@ -535,7 +533,7 @@ Server {
 
     // Fixup assignment manager status
     status.setStatus("Starting assignment manager");
-    this.assignmentManager.joinCluster(onlineServers);
+    this.assignmentManager.joinCluster();
 
     this.balancer.setClusterStatus(getClusterStatus());
     this.balancer.setMasterServices(this);
@@ -557,6 +555,11 @@ Server {
     LOG.info("Master has completed initialization");
     initialized = true;
 
+    // Master is initialized: clear dead servers that share host name and port with an online
+    // server. Such entries are kept when a region server checks in before master
+    // initialization completes. See HBASE-5916.
+    this.serverManager.clearDeadServersWithSameHostNameAndPortOfOnlineServer();
+    
     if (this.cpHost != null) {
       // don't let cp initialization errors kill the master
       try {
@@ -577,11 +580,9 @@ Server {
   /**
    * Override to change master's splitLogAfterStartup. Used testing
    * @param mfs
-   * @param onlineServers
    */
-  protected void splitLogAfterStartup(final MasterFileSystem mfs,
-      Set<ServerName> onlineServers) {
-    mfs.splitLogAfterStartup(onlineServers);
+  protected void splitLogAfterStartup(final MasterFileSystem mfs) {
+    mfs.splitLogAfterStartup();
   }
 
   /**

Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java?rev=1343824&r1=1343823&r2=1343824&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java Tue May 29 16:33:41 2012
@@ -183,10 +183,8 @@ public class MasterFileSystem {
   /**
    * Inspect the log directory to recover any log file without
    * an active region server.
-   * @param onlineServers Set of online servers keyed by
-   * {@link ServerName}
    */
-  void splitLogAfterStartup(final Set<ServerName> onlineServers) {
+  void splitLogAfterStartup() {
     boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
         HLog.SPLIT_SKIP_ERRORS_DEFAULT);
     Path logsDirPath = new Path(this.rootdir, HConstants.HREGION_LOGDIR_NAME);
@@ -195,6 +193,10 @@ public class MasterFileSystem {
       try {
         if (!this.fs.exists(logsDirPath)) return;
         FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null);
+        // Get online servers after getting log folders to avoid log folder deletion of newly
+        // checked in region servers. See HBASE-5916.
+        Set<ServerName> onlineServers = ((HMaster) master).getServerManager().getOnlineServers()
+            .keySet();
 
         if (logFolders == null || logFolders.length == 0) {
           LOG.debug("No log files to split, proceeding...");

Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1343824&r1=1343823&r2=1343824&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Tue May 29 16:33:41 2012
@@ -189,7 +189,10 @@ public class ServerManager {
           existingServer + " looks stale, new server:" + serverName);
         expireServer(existingServer);
       }
-      throw new PleaseHoldException(message);
+      if (services.isServerShutdownHandlerEnabled()) {
+        // master has completed the initialization
+        throw new PleaseHoldException(message);
+      }
     }
   }
 
@@ -232,7 +235,10 @@ public class ServerManager {
       throw new YouAreDeadException(message);
     }
 
-    if (this.deadservers.cleanPreviousInstance(serverName)) {
+    // Remove a dead server with the same hostname and port as the newly checking-in RS only
+    // after master initialization. See HBASE-5916 for more information.
+    if ((this.services == null || ((HMaster) this.services).isInitialized())
+        && this.deadservers.cleanPreviousInstance(serverName)) {
       // This server has now become alive after we marked it as dead.
       // We removed it's previous entry from the dead list to reflect it.
       LOG.debug(what + ":" + " Server " + serverName + " came back up," +
@@ -624,4 +630,18 @@ public class ServerManager {
       }
     }
   }
+  
+  /**
+   * Clears every dead server that has the same host name and port as an online server.
+   */
+  void clearDeadServersWithSameHostNameAndPortOfOnlineServer() {
+    ServerName sn = null;
+    for (ServerName serverName : getOnlineServersList()) {
+      while ((sn = ServerName.
+          findServerWithSameHostnamePort(this.deadservers, serverName)) != null) {
+        this.deadservers.remove(sn);
+      }
+    }
+  }
+  
 }

Modified: hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java?rev=1343824&r1=1343823&r2=1343824&view=diff
==============================================================================
--- hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java (original)
+++ hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java Tue May 29 16:33:41 2012
@@ -49,6 +49,8 @@ import org.apache.hadoop.hbase.master.Te
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
 import org.apache.hadoop.hbase.util.Threads;
+import org.apache.hadoop.hbase.zookeeper.ZKAssign;
+import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
 import org.apache.zookeeper.KeeperException;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
@@ -95,9 +97,8 @@ public class TestRSKilledWhenMasterIniti
     }
 
     @Override
-    protected void splitLogAfterStartup(MasterFileSystem mfs,
-        Set<ServerName> onlineServers) {
-      super.splitLogAfterStartup(mfs, onlineServers);
+    protected void splitLogAfterStartup(MasterFileSystem mfs) {
+      super.splitLogAfterStartup(mfs);
       logSplit = true;
       // If "TestingMaster.sleep" is set, sleep after log split.
       if (getConfiguration().getBoolean("TestingMaster.sleep", false)) {
@@ -212,6 +213,10 @@ public class TestRSKilledWhenMasterIniti
     while (serverManager.areDeadServersInProgress()) {
       Thread.sleep(100);
     }
+    // Create a ZKW to use in the test
+    ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(TESTUTIL);
+    ZKAssign.blockUntilNoRIT(zkw);
+    
     table = new HTable(TESTUTIL.getConfiguration(), TABLENAME);
     resultScanner = table.getScanner(new Scan());
     count = 0;