You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2018/10/13 00:40:27 UTC

hbase git commit: HBASE-21259 [amv2] Revived deadservers; recreated serverstatenode

Repository: hbase
Updated Branches:
  refs/heads/branch-2.1 4d50f6db5 -> 72af27b8c


HBASE-21259 [amv2] Revived deadservers; recreated serverstatenode

Remove a bunch of places where we create ServerStateNode. We were
creating a SSN even though the server was long dead and processed.
The revived SSN was messing up the little dance we do unassigning
procedures. In particular, in UnassignProcedure, the check for a
dead server inside in isLogSplittingDone returns true -- we can
proceed because server is dead -- fails if an SSN exists.

We were creating SSN when we didn't need it as well as inadvertently.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
 Print serverstatenode when reporting expiration. Helps debugging.
 Make moveFromOnlineToDeadServers return if server online or not.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
 Make do w/ serverName in place of serverNode in a few places.
 In waitServerReportEvent, create a ServerStateNode if none though we
 should not have to at this point; to figure out later: TODO.
 addRegionToServer no longer automatically calls create SSN
 so do explicit create processing load meta and the region
 is OPEN so we can associate OPEN regions with the SSN.
 Do not schedule an SCP if server is not online, not in fs, and not in
 dead servers. No point (and there may be cases where server is long
 gone but hbase:meta still refers to it though it has not carried
 regions in a long time; running an assign/unassign against such a
 server will fail because it is not there but SCP won't clean up
 the outstanding hung RPC because our region is not on the long-gone
 server).

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java
 Just cleanup. Make it so addRegionToServer and remove can deal if no SSN.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java
 Add isWALDirectoryNameWithWALS utility.


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/72af27b8
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/72af27b8
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/72af27b8

Branch: refs/heads/branch-2.1
Commit: 72af27b8c96c2fd10516c027074d6f0ad2110f3a
Parents: 4d50f6d
Author: Michael Stack <st...@apache.org>
Authored: Mon Oct 8 16:54:33 2018 -0700
Committer: Michael Stack <st...@apache.org>
Committed: Fri Oct 12 17:40:11 2018 -0700

----------------------------------------------------------------------
 .../hadoop/hbase/master/MasterFileSystem.java   |  6 +-
 .../hadoop/hbase/master/MasterWalManager.java   | 64 ++++++++++++--
 .../hadoop/hbase/master/ServerManager.java      | 47 ++++++++---
 .../hadoop/hbase/master/SplitLogManager.java    |  4 +-
 .../master/assignment/AssignmentManager.java    | 62 ++++++++------
 .../hbase/master/assignment/RegionStates.java   | 30 +++++--
 .../assignment/RegionTransitionProcedure.java   |  4 +-
 .../master/assignment/UnassignProcedure.java    |  9 +-
 .../master/balancer/RegionLocationFinder.java   |  6 +-
 .../hbase/master/TestMasterWALManager.java      | 87 ++++++++++++++++++++
 10 files changed, 259 insertions(+), 60 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/72af27b8/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
index 864be02..a06a369 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
@@ -199,14 +199,16 @@ public class MasterFileSystem {
   }
 
   /**
-   * @return HBase root dir.
+   * @return HBase root directory.
    */
   public Path getRootDir() {
     return this.rootdir;
   }
 
   /**
-   * @return HBase root log dir.
+   * @return HBase WAL root directory, usually the same as {@link #getRootDir()} but can be
+   *   different if hfiles on one fs and WALs on another. The 'WALs' dir gets made underneath
+   *   the root dir returned here; i.e. this is '/hbase', not '/hbase/WALs'.
    */
   public Path getWALRootDir() {
     return this.walRootDir;

http://git-wip-us.apache.org/repos/asf/hbase/blob/72af27b8/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java
index 2b1a81f..21112a1 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java
@@ -77,6 +77,11 @@ public class MasterWalManager {
 
   // The Path to the old logs dir
   private final Path oldLogDir;
+
+  /**
+   * This is the hbase rootdir.
+   * We'll put the WALs under this dir.
+   */
   private final Path rootDir;
 
   // create the split log lock
@@ -183,10 +188,17 @@ public class MasterWalManager {
   }
 
   /**
+   * @return Returns the WALs dir under <code>rootDir</code>
+   */
+  Path getWALDirPath() {
+    return new Path(this.rootDir, HConstants.HREGION_LOGDIR_NAME);
+  }
+
+  /**
    * @return List of all RegionServer WAL dirs; i.e. this.rootDir/HConstants.HREGION_LOGDIR_NAME.
    */
   public FileStatus[] getWALDirPaths(final PathFilter filter) throws IOException {
-    Path walDirPath = new Path(rootDir, HConstants.HREGION_LOGDIR_NAME);
+    Path walDirPath = getWALDirPath();
     FileStatus[] walDirForServerNames = FSUtils.listStatus(fs, walDirPath, filter);
     return walDirForServerNames == null? new FileStatus[0]: walDirForServerNames;
   }
@@ -205,7 +217,7 @@ public class MasterWalManager {
         WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT);
 
     Set<ServerName> serverNames = new HashSet<>();
-    Path logsDirPath = new Path(this.rootDir, HConstants.HREGION_LOGDIR_NAME);
+    Path logsDirPath = getWALDirPath();
 
     do {
       if (services.isStopped()) {
@@ -286,10 +298,50 @@ public class MasterWalManager {
     splitLog(serverNames, META_FILTER);
   }
 
+  /**
+   * @return True if a WAL directory exists (will return true also if WALs found in
+   *   servername'-splitting' too).
+   */
+  boolean isWALDirectoryNameWithWALs(ServerName serverName) {
+    FileStatus [] fss = null;
+    try {
+      // 'startsWith' will also return dirs ending in AbstractFSWALProvider.SPLITTING_EXT
+      fss = getWALDirPaths(p -> p.getName().startsWith(serverName.toString()));
+    } catch (IOException ioe) {
+      LOG.warn("{}", serverName, ioe);
+      // Something wrong reading from fs. Returning 'true' to bring on more fs activity
+      return true;
+    }
+    if (fss != null) {
+      for (FileStatus fileStatus: fss) {
+        if (fileStatus.isDirectory()) {
+          // Not testing for existence; presuming exists if we got it out of getWALDirPaths
+          // listing. I used to test for presence of WAL and return false if empty but it can be
+          // empty if a clean shutdown. Even clean shutdowns need to be recovered so the meta
+          // and namespace assigns get triggered.
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Depends on current FS Layout!
+   * @return The Path to the WAL directory for <code>serverName</code>
+   */
+  Path getWALDirectoryName(ServerName serverName) {
+    return new Path(this.rootDir, AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
+  }
+
+  /**
+   * Finds WAL dirs for <code>serverNames</code> and renames them with '-splitting' suffix.
+   * @return List of '-splitting' directories that pertain to <code>serverNames</code>
+   */
   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK", justification=
       "We only release this lock when we set it. Updates to code that uses it should verify use " +
       "of the guard boolean.")
-  private List<Path> getLogDirs(final Set<ServerName> serverNames) throws IOException {
+  List<Path> createAndGetLogDirs(final Set<ServerName> serverNames) throws IOException {
     List<Path> logDirs = new ArrayList<>();
     boolean needReleaseLock = false;
     if (!this.services.isInitialized()) {
@@ -300,8 +352,8 @@ public class MasterWalManager {
     }
     try {
       for (ServerName serverName : serverNames) {
-        Path logDir = new Path(this.rootDir,
-          AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
+        Path logDir = getWALDirectoryName(serverName);
+        // This adds the -splitting suffix to logDir.
         Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
         // Rename the directory so a rogue RS doesn't create more WALs
         if (fs.exists(logDir)) {
@@ -341,7 +393,7 @@ public class MasterWalManager {
    */
   public void splitLog(final Set<ServerName> serverNames, PathFilter filter) throws IOException {
     long splitTime = 0, splitLogSize = 0;
-    List<Path> logDirs = getLogDirs(serverNames);
+    List<Path> logDirs = createAndGetLogDirs(serverNames);
 
     splitLogManager.handleDeadWorkers(serverNames);
     splitTime = EnvironmentEdgeManager.currentTime();

http://git-wip-us.apache.org/repos/asf/hbase/blob/72af27b8/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index ee61747..bb5592a 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -34,6 +34,7 @@ import java.util.concurrent.ConcurrentSkipListMap;
 import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.function.Predicate;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.ClockOutOfSyncException;
 import org.apache.hadoop.hbase.HConstants;
@@ -247,8 +248,7 @@ public class ServerManager {
   }
 
   @VisibleForTesting
-  public void regionServerReport(ServerName sn,
-    ServerMetrics sl) throws YouAreDeadException {
+  public void regionServerReport(ServerName sn, ServerMetrics sl) throws YouAreDeadException {
     checkIsDead(sn, "REPORT");
     if (null == this.onlineServers.replace(sn, sl)) {
       // Already have this host+port combo and its just different start code?
@@ -529,6 +529,16 @@ public class ServerManager {
   }
 
   /**
+   * @return True if we should expire <code>serverName</code>
+   */
+  boolean expire(ServerName serverName) {
+    return this.onlineServers.containsKey(serverName) ||
+        this.deadservers.isDeadServer(serverName) ||
+        this.master.getAssignmentManager().getRegionStates().getServerNode(serverName) != null ||
+        this.master.getMasterWalManager().isWALDirectoryNameWithWALs(serverName);
+  }
+
+  /**
    * Expire the passed server. Add it to list of dead servers and queue a shutdown processing.
    * @return True if we queued a ServerCrashProcedure else false if we did not (could happen for
    *         many reasons including the fact that its this server that is going down or we already
@@ -543,11 +553,24 @@ public class ServerManager {
       }
       return false;
     }
+    // Check if we should bother running an expire!
+    if (!expire(serverName)) {
+      LOG.info("Skipping expire; {} is not online, not in deadservers, not in fs -- presuming " +
+          "long gone server instance!", serverName);
+      return false;
+    }
+
     if (this.deadservers.isDeadServer(serverName)) {
-      LOG.warn("Expiration called on {} but crash processing already in progress", serverName);
+      LOG.warn("Expiration called on {} but crash processing in progress, serverStateNode={}",
+          serverName,
+          this.master.getAssignmentManager().getRegionStates().getServerNode(serverName));
       return false;
     }
-    moveFromOnlineToDeadServers(serverName);
+
+    if (!moveFromOnlineToDeadServers(serverName)) {
+      LOG.info("Expiration called on {} but NOT online", serverName);
+      // Continue.
+    }
 
     // If cluster is going down, yes, servers are going to be expiring; don't
     // process as a dead server
@@ -571,20 +594,24 @@ public class ServerManager {
     return true;
   }
 
+  /**
+   * @return Returns true if was online.
+   */
   @VisibleForTesting
-  public void moveFromOnlineToDeadServers(final ServerName sn) {
+  public boolean moveFromOnlineToDeadServers(final ServerName sn) {
+    boolean online = false;
     synchronized (onlineServers) {
-      if (!this.onlineServers.containsKey(sn)) {
-        LOG.trace("Expiration of {} but server not online", sn);
-      }
       // Remove the server from the known servers lists and update load info BUT
       // add to deadservers first; do this so it'll show in dead servers list if
       // not in online servers list.
       this.deadservers.add(sn);
-      this.onlineServers.remove(sn);
-      onlineServers.notifyAll();
+      if (this.onlineServers.remove(sn) != null) {
+        online = true;
+        onlineServers.notifyAll();
+      }
     }
     this.rsAdmins.remove(sn);
+    return online;
   }
 
   /*

http://git-wip-us.apache.org/repos/asf/hbase/blob/72af27b8/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
index 4d977d3..414fc98 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
@@ -452,7 +452,7 @@ public class SplitLogManager {
       }
       deadWorkers.add(workerName);
     }
-    LOG.info("Dead splitlog worker {}", workerName);
+    LOG.debug("Dead splitlog worker {}", workerName);
   }
 
   void handleDeadWorkers(Set<ServerName> serverNames) {
@@ -462,7 +462,7 @@ public class SplitLogManager {
       }
       deadWorkers.addAll(serverNames);
     }
-    LOG.info("dead splitlog workers " + serverNames);
+    LOG.debug("Dead splitlog workers {}", serverNames);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/hbase/blob/72af27b8/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index 04f888c..49fe93a 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -824,6 +824,10 @@ public class AssignmentManager implements ServerListener {
     return builder.build();
   }
 
+  /**
+   * Called when the RegionServer wants to report a Procedure transition.
+   * Ends up calling {@link #reportTransition(RegionStateNode, ServerName, TransitionCode, long)}
+   */
   private void updateRegionTransition(final ServerName serverName, final TransitionCode state,
       final RegionInfo regionInfo, final long seqId)
       throws PleaseHoldException, UnexpectedStateException {
@@ -842,8 +846,7 @@ public class AssignmentManager implements ServerListener {
         serverName, regionNode, state));
     }
 
-    final ServerStateNode serverNode = regionStates.getOrCreateServer(serverName);
-    if (!reportTransition(regionNode, serverNode, state, seqId)) {
+    if (!reportTransition(regionNode, serverName, state, seqId)) {
       // Don't log WARN if shutting down cluster; during shutdown. Avoid the below messages:
       // 2018-08-13 10:45:10,551 WARN ...AssignmentManager: No matching procedure found for
       //   rit=OPEN, location=ve0538.halxg.cloudera.com,16020,1533493000958,
@@ -862,10 +865,9 @@ public class AssignmentManager implements ServerListener {
   }
 
   // FYI: regionNode is sometimes synchronized by the caller but not always.
-  private boolean reportTransition(final RegionStateNode regionNode,
-      final ServerStateNode serverNode, final TransitionCode state, final long seqId)
+  private boolean reportTransition(final RegionStateNode regionNode, ServerName serverName,
+      final TransitionCode state, final long seqId)
       throws UnexpectedStateException {
-    final ServerName serverName = serverNode.getServerName();
     synchronized (regionNode) {
       final RegionTransitionProcedure proc = regionNode.getProcedure();
       if (proc == null) return false;
@@ -954,8 +956,8 @@ public class AssignmentManager implements ServerListener {
             collect(Collectors.toList()));
     }
 
-    final ServerStateNode serverNode = regionStates.getOrCreateServer(serverName);
-
+    // Make sure there is a ServerStateNode for this server that just checked in.
+    ServerStateNode serverNode = regionStates.getOrCreateServer(serverName);
     synchronized (serverNode) {
       if (!serverNode.isInState(ServerState.ONLINE)) {
         LOG.warn("Got a report from a server result in state " + serverNode.getState());
@@ -968,7 +970,7 @@ public class AssignmentManager implements ServerListener {
       LOG.trace("no online region found on " + serverName);
     } else if (!isMetaLoaded()) {
       // if we are still on startup, discard the report unless is from someone holding meta
-      checkOnlineRegionsReportForMeta(serverNode, regionNames);
+      checkOnlineRegionsReportForMeta(serverName, regionNames);
     } else {
       // The Heartbeat updates us of what regions are only. check and verify the state.
       checkOnlineRegionsReport(serverNode, regionNames);
@@ -978,8 +980,7 @@ public class AssignmentManager implements ServerListener {
     wakeServerReportEvent(serverNode);
   }
 
-  void checkOnlineRegionsReportForMeta(final ServerStateNode serverNode,
-      final Set<byte[]> regionNames) {
+  void checkOnlineRegionsReportForMeta(final ServerName serverName, final Set<byte[]> regionNames) {
     try {
       for (byte[] regionName: regionNames) {
         final RegionInfo hri = getMetaRegionFromName(regionName);
@@ -993,18 +994,16 @@ public class AssignmentManager implements ServerListener {
 
         final RegionStateNode regionNode = regionStates.getOrCreateRegionStateNode(hri);
         LOG.info("META REPORTED: " + regionNode);
-        if (!reportTransition(regionNode, serverNode, TransitionCode.OPENED, 0)) {
-          LOG.warn("META REPORTED but no procedure found (complete?); set location=" +
-              serverNode.getServerName());
-          regionNode.setRegionLocation(serverNode.getServerName());
+        if (!reportTransition(regionNode, serverName, TransitionCode.OPENED, 0)) {
+          LOG.warn("META REPORTED but no procedure found (complete?); set location={}", serverName);
+          regionNode.setRegionLocation(serverName);
         } else if (LOG.isTraceEnabled()) {
           LOG.trace("META REPORTED: " + regionNode);
         }
       }
     } catch (UnexpectedStateException e) {
-      final ServerName serverName = serverNode.getServerName();
       LOG.warn("KILLING " + serverName + ": " + e.getMessage());
-      killRegionServer(serverNode);
+      killRegionServer(serverName);
     }
   }
 
@@ -1026,7 +1025,8 @@ public class AssignmentManager implements ServerListener {
                 " but state has otherwise.");
             } else if (regionNode.isInState(State.OPENING)) {
               try {
-                if (!reportTransition(regionNode, serverNode, TransitionCode.OPENED, 0)) {
+                if (!reportTransition(regionNode, serverNode.getServerName(),
+                    TransitionCode.OPENED, 0)) {
                   LOG.warn(regionNode.toString() + " reported OPEN on server=" + serverName +
                     " but state has otherwise AND NO procedure is running");
                 }
@@ -1048,17 +1048,18 @@ public class AssignmentManager implements ServerListener {
       }
     } catch (UnexpectedStateException e) {
       LOG.warn("Killing " + serverName + ": " + e.getMessage());
-      killRegionServer(serverNode);
+      killRegionServer(serverName);
       throw (YouAreDeadException)new YouAreDeadException(e.getMessage()).initCause(e);
     }
   }
 
   protected boolean waitServerReportEvent(ServerName serverName, Procedure<?> proc) {
-    final ServerStateNode serverNode = regionStates.getOrCreateServer(serverName);
-    if (serverNode == null) {
-      LOG.warn("serverName=null; {}", proc);
+    ServerStateNode ssn = this.regionStates.getServerNode(serverName);
+    if (ssn == null) {
+      LOG.warn("Why is ServerStateNode for {} empty at this point? Creating...", serverName);
+      ssn = this.regionStates.getOrCreateServer(serverName);
     }
-    return serverNode.getReportEvent().suspendIfNotReady(proc);
+    return ssn.getReportEvent().suspendIfNotReady(proc);
   }
 
   protected void wakeServerReportEvent(final ServerStateNode serverNode) {
@@ -1269,9 +1270,9 @@ public class AssignmentManager implements ServerListener {
             regionNode.setLastHost(lastHost);
             regionNode.setRegionLocation(regionLocation);
             regionNode.setOpenSeqNum(openSeqNum);
-
             if (localState == State.OPEN) {
               assert regionLocation != null : "found null region location for " + regionNode;
+              regionStates.getOrCreateServer(regionNode.getRegionLocation());
               regionStates.addRegionToServer(regionNode);
             } else if (localState == State.OFFLINE || regionInfo.isOffline()) {
               regionStates.addToOfflineRegions(regionNode);
@@ -1281,9 +1282,18 @@ public class AssignmentManager implements ServerListener {
               // The region is CLOSED and the table is DISABLED/ DISABLING, there is nothing to
               // schedule; the region is inert.
             } else {
-              // These regions should have a procedure in replay
+              // This is region in CLOSING or OPENING state.
+              // These regions should have a procedure in replay.
+              // If they don't, then they will show as STUCK after a while because of the below
+              // registration and will need intervention by operator to fix. Add them to a server
+              // even if the server is not around so a SCP cleans them up.
+              if (regionLocation != null) {
+                regionStates.getOrCreateServer(regionLocation);
+              }
               regionStates.addRegionInTransition(regionNode, null);
             }
+          } else {
+            LOG.info("RIT {}", regionNode);
           }
         }
       }
@@ -1821,8 +1831,8 @@ public class AssignmentManager implements ServerListener {
     wakeServerReportEvent(serverNode);
   }
 
-  private void killRegionServer(final ServerStateNode serverNode) {
-    master.getServerManager().expireServer(serverNode.getServerName());
+  private void killRegionServer(final ServerName serverName) {
+    master.getServerManager().expireServer(serverName);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/hbase/blob/72af27b8/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java
index 40e82f9..8f6a232 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java
@@ -436,7 +436,8 @@ public class RegionStates {
 
     @Override
     public String toString() {
-      return String.format("ServerStateNode(%s)", getServerName());
+      return String.format("name=%s, state=%s, regionCount=%d", getServerName(), getState(),
+          getRegionCount());
     }
   }
 
@@ -1109,9 +1110,10 @@ public class RegionStates {
    * you could mess up online server accounting. TOOD: Review usage and convert
    * to {@link #getServerNode(ServerName)} where we can.
    */
-  public ServerStateNode getOrCreateServer(final ServerName serverName) {
+  ServerStateNode getOrCreateServer(final ServerName serverName) {
     ServerStateNode node = serverMap.get(serverName);
     if (node == null) {
+      LOG.trace("CREATING! {}", serverName, new RuntimeException("WHERE AM I?"));
       node = new ServerStateNode(serverName);
       ServerStateNode oldNode = serverMap.putIfAbsent(serverName, node);
       node = oldNode != null ? oldNode : node;
@@ -1123,7 +1125,7 @@ public class RegionStates {
     serverMap.remove(serverName);
   }
 
-  protected ServerStateNode getServerNode(final ServerName serverName) {
+  public ServerStateNode getServerNode(final ServerName serverName) {
     return serverMap.get(serverName);
   }
 
@@ -1137,10 +1139,18 @@ public class RegionStates {
     return numServers == 0 ? 0.0: (double)totalLoad / (double)numServers;
   }
 
-  public ServerStateNode addRegionToServer(final RegionStateNode regionNode) {
-    ServerStateNode serverNode = getOrCreateServer(regionNode.getRegionLocation());
-    serverNode.addRegion(regionNode);
-    return serverNode;
+  /**
+   * Add reference to region to serverstatenode.
+   * DOES NOT AUTO-CREATE ServerStateNode instance.
+   * @return Return serverstatenode or null if none.
+   */
+  ServerStateNode addRegionToServer(final RegionStateNode regionNode) {
+    ServerStateNode ssn = getServerNode(regionNode.getRegionLocation());
+    if (ssn == null) {
+      return ssn;
+    }
+    ssn.addRegion(regionNode);
+    return ssn;
   }
 
   public boolean isReplicaAvailableForRegion(final RegionInfo info) {
@@ -1165,8 +1175,10 @@ public class RegionStates {
 
   public ServerStateNode removeRegionFromServer(final ServerName serverName,
       final RegionStateNode regionNode) {
-    ServerStateNode serverNode = getOrCreateServer(serverName);
-    serverNode.removeRegion(regionNode);
+    ServerStateNode serverNode = getServerNode(serverName);
+    if (serverNode != null) {
+      serverNode.removeRegion(regionNode);
+    }
     return serverNode;
   }
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/72af27b8/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
index 9f1dd38..c8b0b5c 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
@@ -241,7 +241,7 @@ public abstract class RegionTransitionProcedure
   public synchronized void remoteCallFailed(final MasterProcedureEnv env,
       final ServerName serverName, final IOException exception) {
     final RegionStateNode regionNode = getRegionState(env);
-    LOG.warn("Remote call failed {}; {}", regionNode.toShortString(), this, exception);
+    LOG.warn("Remote call failed {} {}", this, regionNode.toShortString(), exception);
     if (remoteCallFailed(env, regionNode, exception)) {
       // NOTE: This call to wakeEvent puts this Procedure back on the scheduler.
       // Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond
@@ -262,7 +262,7 @@ public abstract class RegionTransitionProcedure
    */
   protected boolean addToRemoteDispatcher(final MasterProcedureEnv env,
       final ServerName targetServer) {
-    LOG.info("Dispatch {}; {}", this, getRegionState(env).toShortString());
+    LOG.info("Dispatch {}", this);
 
     // Put this procedure into suspended mode to wait on report of state change
     // from remote regionserver. Means Procedure associated ProcedureEvent is marked not 'ready'.

http://git-wip-us.apache.org/repos/asf/hbase/blob/72af27b8/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
index 8a391da..589b732 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
@@ -137,8 +137,10 @@ public class UnassignProcedure extends RegionTransitionProcedure {
       throws IOException {
     UnassignRegionStateData.Builder state = UnassignRegionStateData.newBuilder()
         .setTransitionState(getTransitionState())
-        .setHostingServer(ProtobufUtil.toServerName(this.hostingServer))
         .setRegionInfo(ProtobufUtil.toRegionInfo(getRegionInfo()));
+    if (this.hostingServer != null) {
+      state.setHostingServer(ProtobufUtil.toServerName(this.hostingServer));
+    }
     if (this.destinationServer != null) {
       state.setDestinationServer(ProtobufUtil.toServerName(destinationServer));
     }
@@ -161,9 +163,10 @@ public class UnassignProcedure extends RegionTransitionProcedure {
         serializer.deserialize(UnassignRegionStateData.class);
     setTransitionState(state.getTransitionState());
     setRegionInfo(ProtobufUtil.toRegionInfo(state.getRegionInfo()));
-    this.hostingServer = ProtobufUtil.toServerName(state.getHostingServer());
     // The 'force' flag is the override flag in unassign.
     setOverride(state.getForce());
+    this.hostingServer =
+        state.hasHostingServer()? ProtobufUtil.toServerName(state.getHostingServer()): null;
     if (state.hasDestinationServer()) {
       this.destinationServer = ProtobufUtil.toServerName(state.getDestinationServer());
     }
@@ -259,6 +262,7 @@ public class UnassignProcedure extends RegionTransitionProcedure {
       // This exception comes from ServerCrashProcedure AFTER log splitting. Its a signaling
       // exception. SCP found this region as a RIT during its processing of the crash.  Its call
       // into here says it is ok to let this procedure go complete.
+      LOG.info("Safe to let procedure move to next step; {}", this);
       return true;
     }
     if (exception instanceof NotServingRegionException) {
@@ -333,6 +337,7 @@ public class UnassignProcedure extends RegionTransitionProcedure {
           // Return true; wake up the procedure so we can act on proceed.
           return true;
         }
+        LOG.info("Failed expiration and log splitting not done on {}", serverName);
       }
       // Return false so this procedure stays in suspended state. It will be woken up by the
       // ServerCrashProcedure that was scheduled when we called #expireServer above. SCP calls

http://git-wip-us.apache.org/repos/asf/hbase/blob/72af27b8/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/RegionLocationFinder.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/RegionLocationFinder.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/RegionLocationFinder.java
index fb7731f..f944802 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/RegionLocationFinder.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/RegionLocationFinder.java
@@ -299,11 +299,14 @@ class RegionLocationFinder {
   }
 
   public void refreshAndWait(Collection<RegionInfo> hris) {
-    ArrayList<ListenableFuture<HDFSBlocksDistribution>> regionLocationFutures = new ArrayList<>(hris.size());
+    ArrayList<ListenableFuture<HDFSBlocksDistribution>> regionLocationFutures =
+        new ArrayList<>(hris.size());
     for (RegionInfo hregionInfo : hris) {
       regionLocationFutures.add(asyncGetBlockDistribution(hregionInfo));
     }
     int index = 0;
+    LOG.info("Refreshing block distribution cache for {} regions (Can take a while on big cluster)",
+        hris.size());
     for (RegionInfo hregionInfo : hris) {
       ListenableFuture<HDFSBlocksDistribution> future = regionLocationFutures
           .get(index);
@@ -318,6 +321,7 @@ class RegionLocationFinder {
       }
       index++;
     }
+    LOG.info("Finished refreshing block distribution cache for {} regions", hris.size());
   }
 
   // For test

http://git-wip-us.apache.org/repos/asf/hbase/blob/72af27b8/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterWALManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterWALManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterWALManager.java
new file mode 100644
index 0000000..4f8a240
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterWALManager.java
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master;
+
+import static junit.framework.TestCase.assertTrue;
+import static org.junit.Assert.assertFalse;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.testclassification.SmallTests;
+import org.junit.Before;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.mockito.Mockito;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+
+@Category({MasterTests.class, SmallTests.class})
+public class TestMasterWALManager {
+  private static final Logger LOG = LoggerFactory.getLogger(TestMasterWALManager.class);
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+      HBaseClassTestRule.forClass(TestMasterWALManager.class);
+
+  private static final HBaseTestingUtility HTU = new HBaseTestingUtility();
+
+  private MasterWalManager mwm;
+  private MasterServices masterServices;
+
+  @Before
+  public void before() throws IOException {
+    MasterFileSystem mfs = Mockito.mock(MasterFileSystem.class);
+    Mockito.when(mfs.getWALFileSystem()).thenReturn(HTU.getTestFileSystem());
+    Path walRootDir = HTU.createWALRootDir();
+    Mockito.when(mfs.getWALRootDir()).thenReturn(walRootDir);
+    this.masterServices = Mockito.mock(MasterServices.class);
+    Mockito.when(this.masterServices.getConfiguration()).thenReturn(HTU.getConfiguration());
+    Mockito.when(this.masterServices.getMasterFileSystem()).thenReturn(mfs);
+    Mockito.when(this.masterServices.getServerName()).
+        thenReturn(ServerName.parseServerName("master.example.org,0123,456"));
+    this.mwm = new MasterWalManager(this.masterServices);
+  }
+
+  @Test
+  public void testIsWALDirectoryNameWithWALs() throws IOException {
+    ServerName sn = ServerName.parseServerName("x.example.org,1234,5678");
+    assertFalse(this.mwm.isWALDirectoryNameWithWALs(sn));
+    FileSystem walFS = this.masterServices.getMasterFileSystem().getWALFileSystem();
+    Path dir = new Path(this.mwm.getWALDirPath(), sn.toString());
+    assertTrue(walFS.mkdirs(dir));
+    assertTrue(this.mwm.isWALDirectoryNameWithWALs(sn));
+    // Make sure works when dir is SPLITTING
+    Set<ServerName> sns = new HashSet<ServerName>();
+    sns.add(sn);
+    List<Path> paths = this.mwm.createAndGetLogDirs(sns);
+    assertTrue(this.mwm.isWALDirectoryNameWithWALs(sn));
+  }
+}