You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by sy...@apache.org on 2016/02/22 23:20:42 UTC

[14/22] hbase git commit: HBASE-15251 During a cluster restart, Hmaster thinks it is a failover by mistake (Clara Xiong)

HBASE-15251 During a cluster restart, Hmaster thinks it is a failover by mistake (Clara Xiong)


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/8eedc967
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/8eedc967
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/8eedc967

Branch: refs/heads/hbase-12439
Commit: 8eedc967515a4d9133068962fe029160d24e6f95
Parents: f352f3c
Author: tedyu <yu...@gmail.com>
Authored: Thu Feb 18 23:46:54 2016 -0800
Committer: tedyu <yu...@gmail.com>
Committed: Thu Feb 18 23:46:54 2016 -0800

----------------------------------------------------------------------
 .../hadoop/hbase/master/AssignmentManager.java  | 80 +++++++++++++++-----
 1 file changed, 61 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/8eedc967/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
index 7639004..53a080e 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
@@ -18,6 +18,8 @@
  */
 package org.apache.hadoop.hbase.master;
 
+import com.google.common.annotations.VisibleForTesting;
+
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
@@ -44,6 +46,7 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.CoordinatedStateException;
@@ -92,8 +95,6 @@ import org.apache.hadoop.ipc.RemoteException;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.zookeeper.KeeperException;
 
-import com.google.common.annotations.VisibleForTesting;
-
 /**
  * Manages and performs region assignment.
  * Related communications with regionserver are all done over RPC.
@@ -443,31 +444,43 @@ public class AssignmentManager {
       if (LOG.isDebugEnabled()) {
         LOG.debug("Found dead servers out on cluster " + serverManager.getDeadServers());
       }
-    } else {
+      // Check if there are any regions on these servers
+      failover = false;
+      for (ServerName serverName : serverManager.getDeadServers().copyServerNames()) {
+        if (regionStates.getRegionAssignments().values().contains(serverName)) {
+          LOG.debug("Found regions on dead server: " + serverName);
+          failover = true;
+          break;
+        }
+      }
+    }
+    Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
+    if (!failover) {
       // If any one region except meta is assigned, it's a failover.
-      Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
       for (Map.Entry<HRegionInfo, ServerName> en:
           regionStates.getRegionAssignments().entrySet()) {
         HRegionInfo hri = en.getKey();
         if (!hri.isMetaTable()
             && onlineServers.contains(en.getValue())) {
-          LOG.debug("Found " + hri + " out on cluster");
+          LOG.debug("Found region " + hri + " out on cluster");
           failover = true;
           break;
         }
       }
-      if (!failover) {
-        // If any region except meta is in transition on a live server, it's a failover.
-        Map<String, RegionState> regionsInTransition = regionStates.getRegionsInTransition();
-        if (!regionsInTransition.isEmpty()) {
-          for (RegionState regionState: regionsInTransition.values()) {
-            ServerName serverName = regionState.getServerName();
-            if (!regionState.getRegion().isMetaRegion()
-                && serverName != null && onlineServers.contains(serverName)) {
-              LOG.debug("Found " + regionState + " in RITs");
-              failover = true;
-              break;
-            }
+    }
+    if (!failover) {
+      // If any region except meta is in transition on a live server, it's a failover.
+      Map<String, RegionState> regionsInTransition = regionStates.getRegionsInTransition();
+      if (!regionsInTransition.isEmpty()) {
+        for (RegionState regionState: regionsInTransition.values()) {
+          ServerName serverName = regionState.getServerName();
+          if (!regionState.getRegion().isMetaRegion()
+              && serverName != null && onlineServers.contains(serverName)) {
+            LOG.debug("Found " + regionState + " for region " +
+              regionState.getRegion().getRegionNameAsString() + " for server " +
+                serverName + " in RITs");
+            failover = true;
+            break;
           }
         }
       }
@@ -488,7 +501,7 @@ public class AssignmentManager {
           Path logDir = new Path(rootdir,
               DefaultWALProvider.getWALDirectoryName(serverName.toString()));
           Path splitDir = logDir.suffix(DefaultWALProvider.SPLITTING_EXT);
-          if (fs.exists(logDir) || fs.exists(splitDir)) {
+          if (checkWals(fs, logDir) || checkWals(fs, splitDir)) {
             LOG.debug("Found queued dead server " + serverName);
             failover = true;
             break;
@@ -538,8 +551,10 @@ public class AssignmentManager {
     failoverCleanupDone();
     if (!failover) {
       // Fresh cluster startup.
-      LOG.info("Clean cluster startup. Assigning user regions");
+      LOG.info("Clean cluster startup. Don't reassign user regions");
       assignAllUserRegions(allRegions);
+    } else {
+      LOG.info("Failover! Reassign user regions");
     }
     // unassign replicas of the split parents and the merged regions
     // the daughter replicas are opened in assignAllUserRegions if it was
@@ -551,6 +566,49 @@
     return failover;
   }
 
+  /**
+   * Reports whether a WAL directory still contains data.
+   * <p>
+   * The directory counts as holding WALs only if it, or a directory nested beneath it,
+   * contains at least one file with a non-zero length. A missing path, a path that is not a
+   * directory, and a directory with no entries all yield {@code false}.
+   *
+   * @param fs filesystem used to inspect {@code dir}
+   * @param dir directory to examine
+   * @return {@code true} if a non-empty file exists under {@code dir}, {@code false} otherwise
+   * @throws IOException if the filesystem cannot be queried
+   */
+  private boolean checkWals(FileSystem fs, Path dir) throws IOException {
+    if (!fs.exists(dir)) {
+      LOG.debug(dir + " doesn't exist");
+      return false;
+    }
+    if (!fs.getFileStatus(dir).isDirectory()) {
+      LOG.warn(dir + " is not a directory");
+      return false;
+    }
+    // The listing is treated as possibly null as well as empty, so guard against both.
+    FileStatus[] files = FSUtils.listStatus(fs, dir);
+    if (files == null || files.length == 0) {
+      LOG.debug(dir + " has no files");
+      return false;
+    }
+    for (FileStatus file : files) {
+      if (file.isFile() && file.getLen() > 0) {
+        LOG.debug(dir + " has a non-empty file: " + file.getPath());
+        return true;
+      }
+      // Descend into the child directory itself. Recursing on dir again would re-list the
+      // same entries and never terminate.
+      if (file.isDirectory() && checkWals(fs, file.getPath())) {
+        LOG.debug(dir + " has a sub-directory with a non-empty file: " + file.getPath());
+        return true;
+      }
+    }
+    LOG.debug("Found 0 non-empty wal files for :" + dir);
+    return false;
+  }
+
   /**
    * When a region is closed, it should be removed from the regionsToReopen
    * @param hri HRegionInfo of the region which was closed