You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by nd...@apache.org on 2015/06/18 00:37:51 UTC

hbase git commit: HBASE-13605 RegionStates should not keep its list of dead servers (Enis Soztutar)

Repository: hbase
Updated Branches:
  refs/heads/branch-1.1 635dae324 -> 070655958


HBASE-13605 RegionStates should not keep its list of dead servers (Enis Soztutar)


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/07065595
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/07065595
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/07065595

Branch: refs/heads/branch-1.1
Commit: 0706559581c388688f1b5f719bff20092632ffd1
Parents: 635dae3
Author: Nick Dimiduk <nd...@apache.org>
Authored: Wed Jun 17 15:35:44 2015 -0700
Committer: Nick Dimiduk <nd...@apache.org>
Committed: Wed Jun 17 15:35:44 2015 -0700

----------------------------------------------------------------------
 .../hadoop/hbase/master/AssignmentManager.java  |  4 +-
 .../hadoop/hbase/master/RegionStates.java       | 32 +-------
 .../hadoop/hbase/master/ServerManager.java      | 83 ++++++--------------
 .../master/TestAssignmentManagerOnCluster.java  | 18 +----
 4 files changed, 33 insertions(+), 104 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/07065595/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
index cd0e9d7..ebc3d01 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
@@ -799,7 +799,7 @@ public class AssignmentManager extends ZooKeeperListener {
         // we need to reset the last region server of the region.
         regionStates.setLastRegionServerOfRegion(sn, encodedName);
         // Make sure we know the server is dead.
-        if (!serverManager.isServerDead(sn)) {
+        if (!serverManager.isServerInDeadServersList(sn)) {
           serverManager.expireServer(sn);
         }
       }
@@ -3079,7 +3079,7 @@ public class AssignmentManager extends ZooKeeperListener {
       Set<ServerName> deadServers) throws IOException, KeeperException {
     if (deadServers != null && !deadServers.isEmpty()) {
       for (ServerName serverName: deadServers) {
-        if (!serverManager.isServerDead(serverName)) {
+        if (!serverManager.isServerInDeadServersList(serverName)) {
           serverManager.expireServer(serverName); // Let SSH do region re-assign
         }
       }

http://git-wip-us.apache.org/repos/asf/hbase/blob/07065595/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
index 78097ac..4348ad6 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
@@ -44,7 +44,6 @@ import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.TableStateManager;
 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
-import org.apache.hadoop.hbase.MetaTableAccessor;
 import org.apache.hadoop.hbase.master.RegionState.State;
 import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
 import org.apache.hadoop.hbase.util.Bytes;
@@ -132,14 +131,6 @@ public class RegionStates {
     new HashMap<String, ServerName>();
 
   /**
-   * Map a host port pair string to the latest start code
-   * of a region server which is known to be dead. It is dead
-   * to us, but server manager may not know it yet.
-   */
-  private final HashMap<String, Long> deadServers =
-    new HashMap<String, Long>();
-
-  /**
    * Map a dead servers to the time when log split is done.
    * Since log splitting is not ordered, we have to remember
    * all processed instances. The map is cleaned up based
@@ -823,10 +814,6 @@ public class RegionStates {
    * If so, we should hold re-assign this region till SSH has split its wals.
    * Once logs are split, the last assignment of this region will be reset,
    * which means a null last assignment server is ok for re-assigning.
-   *
-   * A region server could be dead but we don't know it yet. We may
-   * think it's online falsely. Therefore if a server is online, we still
-   * need to confirm it reachable and having the expected start code.
    */
   synchronized boolean wasRegionOnDeadServer(final String encodedName) {
     ServerName server = lastAssignments.get(encodedName);
@@ -836,24 +823,9 @@ public class RegionStates {
   synchronized boolean isServerDeadAndNotProcessed(ServerName server) {
     if (server == null) return false;
     if (serverManager.isServerOnline(server)) {
-      String hostAndPort = server.getHostAndPort();
-      long startCode = server.getStartcode();
-      Long deadCode = deadServers.get(hostAndPort);
-      if (deadCode == null || startCode > deadCode.longValue()) {
-        if (serverManager.isServerReachable(server)) {
-          return false;
-        }
-        // The size of deadServers won't grow unbounded.
-        deadServers.put(hostAndPort, Long.valueOf(startCode));
+      if (!serverManager.isServerDead(server)) {
+        return false;
       }
-      // Watch out! If the server is not dead, the region could
-      // remain unassigned. That's why ServerManager#isServerReachable
-      // should use some retry.
-      //
-      // We cache this info since it is very unlikely for that
-      // instance to come back up later on. We don't want to expire
-      // the server since we prefer to let it die naturally.
-      LOG.warn("Couldn't reach online server " + server);
     }
     // Now, we know it's dead. Check if it's processed
     return !processedServers.containsKey(server);

http://git-wip-us.apache.org/repos/asf/hbase/blob/07065595/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index 5c8bd34..b4d31e8 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -50,7 +50,6 @@ import org.apache.hadoop.hbase.ZooKeeperConnectionException;
 import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.hbase.client.ClusterConnection;
 import org.apache.hadoop.hbase.client.ConnectionFactory;
-import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
 import org.apache.hadoop.hbase.client.RetriesExhaustedException;
 import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer;
 import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
@@ -62,14 +61,12 @@ import org.apache.hadoop.hbase.protobuf.ResponseConverter;
 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionRequest;
 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionResponse;
-import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo;
 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStartupRequest;
 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.RegionStoreSequenceIds;
 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.StoreSequenceId;
 import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode;
 import org.apache.hadoop.hbase.regionserver.HRegionServer;
 import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
-import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.Triple;
 import org.apache.hadoop.hbase.util.RetryCounter;
@@ -157,8 +154,6 @@ public class ServerManager {
   private final long maxSkew;
   private final long warningSkew;
 
-  private final RetryCounterFactory pingRetryCounterFactory;
-
   /**
    * Set of region servers which are dead but not processed immediately. If one
    * server died before master enables ServerShutdownHandler, the server will be
@@ -217,11 +212,6 @@ public class ServerManager {
     maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
     warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
     this.connection = connect ? (ClusterConnection)ConnectionFactory.createConnection(c) : null;
-    int pingMaxAttempts = Math.max(1, master.getConfiguration().getInt(
-      "hbase.master.maximum.ping.server.attempts", 10));
-    int pingSleepInterval = Math.max(1, master.getConfiguration().getInt(
-      "hbase.master.ping.server.retry.sleep.interval", 100));
-    this.pingRetryCounterFactory = new RetryCounterFactory(pingMaxAttempts, pingSleepInterval);
   }
 
   /**
@@ -744,8 +734,8 @@ public class ServerManager {
         " failed because no RPC connection found to this server");
       return RegionOpeningState.FAILED_OPENING;
     }
-    OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server, 
-      region, versionOfOfflineNode, favoredNodes, 
+    OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server,
+      region, versionOfOfflineNode, favoredNodes,
       (RecoveryMode.LOG_REPLAY == this.services.getMasterFileSystem().getLogRecoveryMode()));
     try {
       OpenRegionResponse response = admin.openRegion(null, request);
@@ -842,7 +832,7 @@ public class ServerManager {
    * Contacts a region server and waits up to timeout ms
    * to close the region.  This bypasses the active hmaster.
    */
-  public static void closeRegionSilentlyAndWait(ClusterConnection connection, 
+  public static void closeRegionSilentlyAndWait(ClusterConnection connection,
     ServerName server, HRegionInfo region, long timeout) throws IOException, InterruptedException {
     AdminService.BlockingInterface rs = connection.getAdmin(server);
     try {
@@ -897,47 +887,6 @@ public class ServerManager {
     ProtobufUtil.mergeRegions(admin, region_a, region_b, forcible);
   }
 
-  /**
-   * Check if a region server is reachable and has the expected start code
-   */
-  public boolean isServerReachable(ServerName server) {
-    if (server == null) throw new NullPointerException("Passed server is null");
-
-    RetryCounter retryCounter = pingRetryCounterFactory.create();
-    while (retryCounter.shouldRetry()) {
-      synchronized (this.onlineServers) {
-        if (this.deadservers.isDeadServer(server)) {
-          return false;
-        }
-      }
-      try {
-        AdminService.BlockingInterface admin = getRsAdmin(server);
-        if (admin != null) {
-          ServerInfo info = ProtobufUtil.getServerInfo(admin);
-          return info != null && info.hasServerName()
-            && server.getStartcode() == info.getServerName().getStartCode();
-        }
-      } catch (RegionServerStoppedException | ServerNotRunningYetException e) {
-        if (LOG.isDebugEnabled()) {
-          LOG.debug("Couldn't reach " + server, e);
-        }
-        break;
-      } catch (IOException ioe) {
-        if (LOG.isDebugEnabled()) {
-          LOG.debug("Couldn't reach " + server + ", try=" + retryCounter.getAttemptTimes() + " of "
-              + retryCounter.getMaxAttempts(), ioe);
-        }
-        try {
-          retryCounter.sleepUntilNextRetry();
-        } catch(InterruptedException ie) {
-          Thread.currentThread().interrupt();
-          break;
-        }
-      }
-    }
-    return false;
-  }
-
     /**
     * @param sn
     * @return Admin interface for the remote regionserver named <code>sn</code>
@@ -1096,10 +1045,30 @@ public class ServerManager {
    * not known to be dead either. it is simply not tracked by the
    * master any more, for example, a very old previous instance).
    */
+  public synchronized boolean isServerInDeadServersList(ServerName serverName) {
+    return (serverName == null || deadservers.isDeadServer(serverName)
+        || queuedDeadServers.contains(serverName)
+        || requeuedDeadServers.containsKey(serverName));
+  }
+
+  /**
+   * Check if a server is known to be dead.  A server can be online,
+   * or known to be dead, or unknown to this manager (i.e, not online,
+   * not known to be dead either. it is simply not tracked by the
+   * master any more, for example, a very old previous instance).
+   */
   public synchronized boolean isServerDead(ServerName serverName) {
-    return serverName == null || deadservers.isDeadServer(serverName)
-      || queuedDeadServers.contains(serverName)
-      || requeuedDeadServers.containsKey(serverName);
+    if (isServerInDeadServersList(serverName)) {
+      return true;
+    }
+
+    // we are not acquiring the lock
+    ServerName onlineServer = findServerWithSameHostnamePortWithLock(serverName);
+    if (onlineServer != null && serverName.getStartcode() < onlineServer.getStartcode()) {
+      return true;
+    }
+
+    return false;
   }
 
   public void shutdownCluster() {

http://git-wip-us.apache.org/repos/asf/hbase/blob/07065595/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
index e892ce7..fd4a89e 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
@@ -1028,18 +1028,12 @@ public class TestAssignmentManagerOnCluster {
       assertTrue(regionStates.isRegionOnline(hri));
       assertEquals(oldServerName, regionStates.getRegionServerOfRegion(hri));
 
-      // Try to unassign the dead region before SSH
-      am.unassign(hri, false);
-      // The region should be moved to offline since the server is dead
-      RegionState state = regionStates.getRegionState(hri);
-      assertTrue(state.isOffline());
-
       // Kill the hosting server, which doesn't have meta on it.
       cluster.killRegionServer(oldServerName);
       cluster.waitForRegionServerToStop(oldServerName, -1);
 
       ServerManager serverManager = master.getServerManager();
-      while (!serverManager.isServerDead(oldServerName)
+      while (!serverManager.isServerInDeadServersList(oldServerName)
           || serverManager.getDeadServers().areDeadServersInProgress()) {
         Thread.sleep(100);
       }
@@ -1093,7 +1087,7 @@ public class TestAssignmentManagerOnCluster {
       TEST_UTIL.waitFor(120000, 1000, new Waiter.Predicate<Exception>() {
         @Override
         public boolean evaluate() throws Exception {
-          return serverManager.isServerDead(serverName)
+          return serverManager.isServerInDeadServersList(serverName)
             && !serverManager.areDeadServersInProgress();
         }
       });
@@ -1159,12 +1153,6 @@ public class TestAssignmentManagerOnCluster {
       assertTrue(regionStates.isRegionOnline(hri));
       assertEquals(oldServerName, regionStates.getRegionServerOfRegion(hri));
 
-      // Try to unassign the dead region before SSH
-      am.unassign(hri, false);
-      // The region should be moved to offline since the server is dead
-      RegionState state = regionStates.getRegionState(hri);
-      assertTrue(state.isOffline());
-
       // Disable the table now.
       master.disableTable(hri.getTable());
 
@@ -1173,7 +1161,7 @@ public class TestAssignmentManagerOnCluster {
       cluster.waitForRegionServerToStop(oldServerName, -1);
 
       ServerManager serverManager = master.getServerManager();
-      while (!serverManager.isServerDead(oldServerName)
+      while (!serverManager.isServerInDeadServersList(oldServerName)
           || serverManager.getDeadServers().areDeadServersInProgress()) {
         Thread.sleep(100);
       }