You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by sy...@apache.org on 2017/05/15 02:14:36 UTC

hbase git commit: HBASE-18036 Data locality is not maintained after cluster restart or SSH (Stephen Yuan Jiang)

Repository: hbase
Updated Branches:
  refs/heads/branch-1.1 7d820db0e -> 26cb211e1


HBASE-18036 Data locality is not maintained after cluster restart or SSH (Stephen Yuan Jiang)


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/26cb211e
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/26cb211e
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/26cb211e

Branch: refs/heads/branch-1.1
Commit: 26cb211e1dc8f5011238de403087777965f0e16a
Parents: 7d820db
Author: Stephen Yuan Jiang <sy...@gmail.com>
Authored: Sun May 14 19:14:21 2017 -0700
Committer: Stephen Yuan Jiang <sy...@gmail.com>
Committed: Sun May 14 19:14:21 2017 -0700

----------------------------------------------------------------------
 .../hadoop/hbase/master/ServerManager.java      |  8 +++
 .../hbase/master/balancer/BaseLoadBalancer.java |  1 -
 .../master/handler/ServerShutdownHandler.java   | 27 ++++++++-
 .../hbase/master/TestAssignmentManager.java     | 62 ++++++++++++++++++++
 4 files changed, 94 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/26cb211e/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index 02ed231..7ccbb96 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -1102,6 +1102,14 @@ public class ServerManager {
   }
 
   /**
+   * Checks whether a server with the same hostname and port as {@code serverName} is online.
+   * @return true if a server with a matching hostname and port is found.
+   */
+  public boolean isServerWithSameHostnamePortOnline(final ServerName serverName) {
+    return findServerWithSameHostnamePortWithLock(serverName) != null;
+  }
+
+  /**
    * Check if a server is known to be dead.  A server can be online,
    * or known to be dead, or unknown to this manager (i.e, not online,
    * not known to be dead either. it is simply not tracked by the

http://git-wip-us.apache.org/repos/asf/hbase/blob/26cb211e/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java
index 97afa49..75f1c47 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java
@@ -44,7 +44,6 @@ import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.RegionLoad;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
-import org.apache.hadoop.hbase.conf.ConfigurationObserver;
 import org.apache.hadoop.hbase.master.LoadBalancer;
 import org.apache.hadoop.hbase.master.MasterServices;
 import org.apache.hadoop.hbase.master.RackManager;

http://git-wip-us.apache.org/repos/asf/hbase/blob/26cb211e/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
index 163c942..445b2f0 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
@@ -21,7 +21,9 @@ package org.apache.hadoop.hbase.master.handler;
 import java.io.IOException;
 import java.io.InterruptedIOException;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.locks.Lock;
 
@@ -307,13 +309,32 @@ public class ServerShutdownHandler extends EventHandler {
         }
       }
 
+      // Determine what type of assignment to do if the dead server already restarted.
+      boolean retainAssignment =
+          (server.getConfiguration().getBoolean("hbase.master.retain.assignment", true) &&
+          serverManager.isServerWithSameHostnamePortOnline(serverName)) ? true : false;
+
       try {
-        am.assign(toAssignRegions);
+        if (retainAssignment) {
+          Map<HRegionInfo, ServerName> toAssignRegionsMap =
+              new HashMap<HRegionInfo, ServerName>(toAssignRegions.size());
+          for (HRegionInfo hri: toAssignRegions) {
+            toAssignRegionsMap.put(hri, serverName);
+          }
+          LOG.info("Best effort in SSH to retain assignment of " + toAssignRegions.size()
+            + " regions from the dead server " + serverName);
+          am.assign(toAssignRegionsMap);
+        } else {
+          LOG.info("Using round robin in SSH to assign " + toAssignRegions.size()
+          + " regions from the dead server " + serverName);
+          am.assign(toAssignRegions);
+        }
       } catch (InterruptedException ie) {
-        LOG.error("Caught " + ie + " during round-robin assignment");
+        LOG.error("Caught " + ie + " during " + (retainAssignment ? "retaining" : "round-robin")
+          + " assignment");
         throw (InterruptedIOException)new InterruptedIOException().initCause(ie);
       } catch (IOException ioe) {
-        LOG.info("Caught " + ioe + " during region assignment, will retry");
+        LOG.warn("Caught " + ioe + " during region assignment, will retry");
         // Only do wal splitting if shouldSplitWal and in DLR mode
         serverManager.processDeadServer(serverName,
           this.shouldSplitWal && distributedLogReplay);

http://git-wip-us.apache.org/repos/asf/hbase/blob/26cb211e/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManager.java
index d585756..7bd3444 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManager.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManager.java
@@ -112,6 +112,8 @@ public class TestAssignmentManager {
   private static final HBaseTestingUtility HTU = new HBaseTestingUtility();
   private static final ServerName SERVERNAME_A =
       ServerName.valueOf("example.org", 1234, 5678);
+  private static final ServerName SERVERNAME_AA =
+      ServerName.valueOf("example.org", 1234, 9999); // SERVERNAME_A with another start code
   private static final ServerName SERVERNAME_B =
       ServerName.valueOf("example.org", 0, 5678);
   private static final HRegionInfo REGIONINFO =
@@ -487,6 +489,33 @@ public class TestAssignmentManager {
   }
 
   /**
+   * Run the server shutdown handler when the dead server is already back online (same host:port).
+   * @throws KeeperException
+   * @throws IOException
+   */
+  @Test (timeout=180000)
+  public void testShutdownHandlerWithRestartedServer()
+      throws KeeperException, IOException, CoordinatedStateException, ServiceException {
+    // Create and start up an executor; the AssignmentManager uses it
+    // when handling zk callbacks.
+    ExecutorService executor = startupMasterExecutor("testShutdownHandlerWithRestartedServer");
+
+    // Create a mocked AM, then mark REGIONINFO online on SERVERNAME_A in an ENABLED table.
+    AssignmentManagerWithExtrasForTesting am =
+      setUpMockedAssignmentManager(this.server, this.serverManager);
+    am.getRegionStates().regionOnline(REGIONINFO, SERVERNAME_A);
+    am.getTableStateManager().setTableState(REGIONINFO.getTable(), Table.State.ENABLED);
+    try {
+      processServerShutdownHandler(am, false, true); // splitRegion=false, deadserverRestarted=true
+    } finally {
+      executor.shutdown();
+      am.shutdown();
+      // Clean up all znodes
+      ZKAssign.deleteAllNodes(this.watcher);
+    }
+  }
+
+  /**
    * To test closed region handler to remove rit and delete corresponding znode
    * if region in pending close or closing while processing shutdown of a region
    * server.(HBASE-5927).
@@ -621,6 +650,12 @@ public class TestAssignmentManager {
 
   private void processServerShutdownHandler(AssignmentManager am, boolean splitRegion)
       throws IOException, ServiceException {
+    processServerShutdownHandler(am, splitRegion, false); // deadserverRestarted=false
+  }
+
+  private void processServerShutdownHandler(
+      AssignmentManager am, boolean splitRegion, boolean deadserverRestarted)
+      throws IOException, ServiceException {
     // Make sure our new AM gets callbacks; once registered, can't unregister.
     // Thats ok because we make a new zk watcher for each test.
     this.watcher.registerListenerFirst(am);
@@ -676,6 +711,25 @@ public class TestAssignmentManager {
       // Have it that SERVERNAME_A died.
       DeadServer deadServers = new DeadServer();
       deadServers.add(SERVERNAME_A);
+      Mockito.when(this.serverManager.isServerReachable(SERVERNAME_B)).thenReturn(true);
+      Mockito.when(this.serverManager.isServerOnline(SERVERNAME_A)).thenReturn(false);
+      final Map<ServerName, ServerLoad> onlineServers = new HashMap<ServerName, ServerLoad>();
+      onlineServers.put(SERVERNAME_B, ServerLoad.EMPTY_SERVERLOAD);
+      if (deadserverRestarted) {
+        // Now make the same server (same host name and port) online again with a different
+        // start code.
+        Mockito.when(this.serverManager.isServerOnline(SERVERNAME_AA)).thenReturn(true);
+        Mockito.when(this.serverManager.isServerReachable(SERVERNAME_AA)).thenReturn(true);
+        Mockito.when(
+          this.serverManager.isServerWithSameHostnamePortOnline(SERVERNAME_A)).thenReturn(true);
+        onlineServers.put(SERVERNAME_AA, ServerLoad.EMPTY_SERVERLOAD);
+      }
+      Mockito.when(this.serverManager.getOnlineServersList()).thenReturn(
+          new ArrayList<ServerName>(onlineServers.keySet()));
+      Mockito.when(this.serverManager.getOnlineServers()).thenReturn(onlineServers);
+      List<ServerName> avServers = new ArrayList<ServerName>();
+      avServers.addAll(onlineServers.keySet());
+      Mockito.when(this.serverManager.createDestinationServersList()).thenReturn(avServers);
       // I need a services instance that will return the AM
       MasterFileSystem fs = Mockito.mock(MasterFileSystem.class);
       Mockito.doNothing().when(fs).setLogRecoveryMode();
@@ -1383,6 +1437,14 @@ public class TestAssignmentManager {
     }
 
     @Override
+    public void assign(Map<HRegionInfo, ServerName> regionServerMap)
+        throws IOException, InterruptedException {
+      assignInvoked = (regionServerMap != null && regionServerMap.size() > 0);
+      super.assign(regionServerMap);
+      this.gate.set(true); // set only once super.assign() has returned
+    }
+
+    @Override
     public void assign(List<HRegionInfo> regions)
         throws IOException, InterruptedException {
       assignInvoked = (regions != null && regions.size() > 0);