You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by en...@apache.org on 2015/06/18 20:58:02 UTC

hbase git commit: Revert "HBASE-13605 RegionStates should not keep its list of dead servers (Enis Soztutar)"

Repository: hbase
Updated Branches:
  refs/heads/branch-1.1 73ac8f378 -> 54dc45b8f


Revert "HBASE-13605 RegionStates should not keep its list of dead servers (Enis Soztutar)"

This reverts commit 0706559581c388688f1b5f719bff20092632ffd1.


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/54dc45b8
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/54dc45b8
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/54dc45b8

Branch: refs/heads/branch-1.1
Commit: 54dc45b8fe114aa3e63a345717bea748c760e95a
Parents: 73ac8f3
Author: Enis Soztutar <en...@apache.org>
Authored: Thu Jun 18 11:45:37 2015 -0700
Committer: Enis Soztutar <en...@apache.org>
Committed: Thu Jun 18 11:45:37 2015 -0700

----------------------------------------------------------------------
 .../hadoop/hbase/master/AssignmentManager.java  |  4 +-
 .../hadoop/hbase/master/RegionStates.java       | 32 +++++++-
 .../hadoop/hbase/master/ServerManager.java      | 83 ++++++++++++++------
 .../master/TestAssignmentManagerOnCluster.java  | 18 ++++-
 4 files changed, 104 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/54dc45b8/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
index ebc3d01..cd0e9d7 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
@@ -799,7 +799,7 @@ public class AssignmentManager extends ZooKeeperListener {
         // we need to reset the last region server of the region.
         regionStates.setLastRegionServerOfRegion(sn, encodedName);
         // Make sure we know the server is dead.
-        if (!serverManager.isServerInDeadServersList(sn)) {
+        if (!serverManager.isServerDead(sn)) {
           serverManager.expireServer(sn);
         }
       }
@@ -3079,7 +3079,7 @@ public class AssignmentManager extends ZooKeeperListener {
       Set<ServerName> deadServers) throws IOException, KeeperException {
     if (deadServers != null && !deadServers.isEmpty()) {
       for (ServerName serverName: deadServers) {
-        if (!serverManager.isServerInDeadServersList(serverName)) {
+        if (!serverManager.isServerDead(serverName)) {
           serverManager.expireServer(serverName); // Let SSH do region re-assign
         }
       }

http://git-wip-us.apache.org/repos/asf/hbase/blob/54dc45b8/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
index 4348ad6..78097ac 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
@@ -44,6 +44,7 @@ import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.TableStateManager;
 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
+import org.apache.hadoop.hbase.MetaTableAccessor;
 import org.apache.hadoop.hbase.master.RegionState.State;
 import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
 import org.apache.hadoop.hbase.util.Bytes;
@@ -131,6 +132,14 @@ public class RegionStates {
     new HashMap<String, ServerName>();
 
   /**
+   * Map a host port pair string to the latest start code
+   * of a region server which is known to be dead. It is dead
+   * to us, but server manager may not know it yet.
+   */
+  private final HashMap<String, Long> deadServers =
+    new HashMap<String, Long>();
+
+  /**
    * Map a dead servers to the time when log split is done.
    * Since log splitting is not ordered, we have to remember
    * all processed instances. The map is cleaned up based
@@ -814,6 +823,10 @@ public class RegionStates {
    * If so, we should hold re-assign this region till SSH has split its wals.
    * Once logs are split, the last assignment of this region will be reset,
    * which means a null last assignment server is ok for re-assigning.
+   *
+   * A region server could be dead but we don't know it yet. We may
+   * think it's online falsely. Therefore if a server is online, we still
+   * need to confirm it reachable and having the expected start code.
    */
   synchronized boolean wasRegionOnDeadServer(final String encodedName) {
     ServerName server = lastAssignments.get(encodedName);
@@ -823,9 +836,24 @@ public class RegionStates {
   synchronized boolean isServerDeadAndNotProcessed(ServerName server) {
     if (server == null) return false;
     if (serverManager.isServerOnline(server)) {
-      if (!serverManager.isServerDead(server)) {
-        return false;
+      String hostAndPort = server.getHostAndPort();
+      long startCode = server.getStartcode();
+      Long deadCode = deadServers.get(hostAndPort);
+      if (deadCode == null || startCode > deadCode.longValue()) {
+        if (serverManager.isServerReachable(server)) {
+          return false;
+        }
+        // The size of deadServers won't grow unbounded.
+        deadServers.put(hostAndPort, Long.valueOf(startCode));
       }
+      // Watch out! If the server is not dead, the region could
+      // remain unassigned. That's why ServerManager#isServerReachable
+      // should use some retry.
+      //
+      // We cache this info since it is very unlikely for that
+      // instance to come back up later on. We don't want to expire
+      // the server since we prefer to let it die naturally.
+      LOG.warn("Couldn't reach online server " + server);
     }
     // Now, we know it's dead. Check if it's processed
     return !processedServers.containsKey(server);

http://git-wip-us.apache.org/repos/asf/hbase/blob/54dc45b8/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index b4d31e8..5c8bd34 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -50,6 +50,7 @@ import org.apache.hadoop.hbase.ZooKeeperConnectionException;
 import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.hbase.client.ClusterConnection;
 import org.apache.hadoop.hbase.client.ConnectionFactory;
+import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
 import org.apache.hadoop.hbase.client.RetriesExhaustedException;
 import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer;
 import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
@@ -61,12 +62,14 @@ import org.apache.hadoop.hbase.protobuf.ResponseConverter;
 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionRequest;
 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionResponse;
+import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo;
 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStartupRequest;
 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.RegionStoreSequenceIds;
 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.StoreSequenceId;
 import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode;
 import org.apache.hadoop.hbase.regionserver.HRegionServer;
 import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
+import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.Triple;
 import org.apache.hadoop.hbase.util.RetryCounter;
@@ -154,6 +157,8 @@ public class ServerManager {
   private final long maxSkew;
   private final long warningSkew;
 
+  private final RetryCounterFactory pingRetryCounterFactory;
+
   /**
    * Set of region servers which are dead but not processed immediately. If one
    * server died before master enables ServerShutdownHandler, the server will be
@@ -212,6 +217,11 @@ public class ServerManager {
     maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
     warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
     this.connection = connect ? (ClusterConnection)ConnectionFactory.createConnection(c) : null;
+    int pingMaxAttempts = Math.max(1, master.getConfiguration().getInt(
+      "hbase.master.maximum.ping.server.attempts", 10));
+    int pingSleepInterval = Math.max(1, master.getConfiguration().getInt(
+      "hbase.master.ping.server.retry.sleep.interval", 100));
+    this.pingRetryCounterFactory = new RetryCounterFactory(pingMaxAttempts, pingSleepInterval);
   }
 
   /**
@@ -734,8 +744,8 @@ public class ServerManager {
         " failed because no RPC connection found to this server");
       return RegionOpeningState.FAILED_OPENING;
     }
-    OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server,
-      region, versionOfOfflineNode, favoredNodes,
+    OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server, 
+      region, versionOfOfflineNode, favoredNodes, 
       (RecoveryMode.LOG_REPLAY == this.services.getMasterFileSystem().getLogRecoveryMode()));
     try {
       OpenRegionResponse response = admin.openRegion(null, request);
@@ -832,7 +842,7 @@ public class ServerManager {
    * Contacts a region server and waits up to timeout ms
    * to close the region.  This bypasses the active hmaster.
    */
-  public static void closeRegionSilentlyAndWait(ClusterConnection connection,
+  public static void closeRegionSilentlyAndWait(ClusterConnection connection, 
     ServerName server, HRegionInfo region, long timeout) throws IOException, InterruptedException {
     AdminService.BlockingInterface rs = connection.getAdmin(server);
     try {
@@ -887,6 +897,47 @@ public class ServerManager {
     ProtobufUtil.mergeRegions(admin, region_a, region_b, forcible);
   }
 
+  /**
+   * Check if a region server is reachable and has the expected start code
+   */
+  public boolean isServerReachable(ServerName server) {
+    if (server == null) throw new NullPointerException("Passed server is null");
+
+    RetryCounter retryCounter = pingRetryCounterFactory.create();
+    while (retryCounter.shouldRetry()) {
+      synchronized (this.onlineServers) {
+        if (this.deadservers.isDeadServer(server)) {
+          return false;
+        }
+      }
+      try {
+        AdminService.BlockingInterface admin = getRsAdmin(server);
+        if (admin != null) {
+          ServerInfo info = ProtobufUtil.getServerInfo(admin);
+          return info != null && info.hasServerName()
+            && server.getStartcode() == info.getServerName().getStartCode();
+        }
+      } catch (RegionServerStoppedException | ServerNotRunningYetException e) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Couldn't reach " + server, e);
+        }
+        break;
+      } catch (IOException ioe) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Couldn't reach " + server + ", try=" + retryCounter.getAttemptTimes() + " of "
+              + retryCounter.getMaxAttempts(), ioe);
+        }
+        try {
+          retryCounter.sleepUntilNextRetry();
+        } catch(InterruptedException ie) {
+          Thread.currentThread().interrupt();
+          break;
+        }
+      }
+    }
+    return false;
+  }
+
     /**
     * @param sn
     * @return Admin interface for the remote regionserver named <code>sn</code>
@@ -1045,30 +1096,10 @@ public class ServerManager {
    * not known to be dead either. it is simply not tracked by the
    * master any more, for example, a very old previous instance).
    */
-  public synchronized boolean isServerInDeadServersList(ServerName serverName) {
-    return (serverName == null || deadservers.isDeadServer(serverName)
-        || queuedDeadServers.contains(serverName)
-        || requeuedDeadServers.containsKey(serverName));
-  }
-
-  /**
-   * Check if a server is known to be dead.  A server can be online,
-   * or known to be dead, or unknown to this manager (i.e, not online,
-   * not known to be dead either. it is simply not tracked by the
-   * master any more, for example, a very old previous instance).
-   */
   public synchronized boolean isServerDead(ServerName serverName) {
-    if (isServerInDeadServersList(serverName)) {
-      return true;
-    }
-
-    // we are not acquiring the lock
-    ServerName onlineServer = findServerWithSameHostnamePortWithLock(serverName);
-    if (onlineServer != null && serverName.getStartcode() < onlineServer.getStartcode()) {
-      return true;
-    }
-
-    return false;
+    return serverName == null || deadservers.isDeadServer(serverName)
+      || queuedDeadServers.contains(serverName)
+      || requeuedDeadServers.containsKey(serverName);
   }
 
   public void shutdownCluster() {

http://git-wip-us.apache.org/repos/asf/hbase/blob/54dc45b8/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
index fd4a89e..e892ce7 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
@@ -1028,12 +1028,18 @@ public class TestAssignmentManagerOnCluster {
       assertTrue(regionStates.isRegionOnline(hri));
       assertEquals(oldServerName, regionStates.getRegionServerOfRegion(hri));
 
+      // Try to unassign the dead region before SSH
+      am.unassign(hri, false);
+      // The region should be moved to offline since the server is dead
+      RegionState state = regionStates.getRegionState(hri);
+      assertTrue(state.isOffline());
+
       // Kill the hosting server, which doesn't have meta on it.
       cluster.killRegionServer(oldServerName);
       cluster.waitForRegionServerToStop(oldServerName, -1);
 
       ServerManager serverManager = master.getServerManager();
-      while (!serverManager.isServerInDeadServersList(oldServerName)
+      while (!serverManager.isServerDead(oldServerName)
           || serverManager.getDeadServers().areDeadServersInProgress()) {
         Thread.sleep(100);
       }
@@ -1087,7 +1093,7 @@ public class TestAssignmentManagerOnCluster {
       TEST_UTIL.waitFor(120000, 1000, new Waiter.Predicate<Exception>() {
         @Override
         public boolean evaluate() throws Exception {
-          return serverManager.isServerInDeadServersList(serverName)
+          return serverManager.isServerDead(serverName)
             && !serverManager.areDeadServersInProgress();
         }
       });
@@ -1153,6 +1159,12 @@ public class TestAssignmentManagerOnCluster {
       assertTrue(regionStates.isRegionOnline(hri));
       assertEquals(oldServerName, regionStates.getRegionServerOfRegion(hri));
 
+      // Try to unassign the dead region before SSH
+      am.unassign(hri, false);
+      // The region should be moved to offline since the server is dead
+      RegionState state = regionStates.getRegionState(hri);
+      assertTrue(state.isOffline());
+
       // Disable the table now.
       master.disableTable(hri.getTable());
 
@@ -1161,7 +1173,7 @@ public class TestAssignmentManagerOnCluster {
       cluster.waitForRegionServerToStop(oldServerName, -1);
 
       ServerManager serverManager = master.getServerManager();
-      while (!serverManager.isServerInDeadServersList(oldServerName)
+      while (!serverManager.isServerDead(oldServerName)
           || serverManager.getDeadServers().areDeadServersInProgress()) {
         Thread.sleep(100);
       }