You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by en...@apache.org on 2015/06/18 20:58:02 UTC
hbase git commit: Revert "HBASE-13605 RegionStates should not keep
its list of dead servers (Enis Soztutar)"
Repository: hbase
Updated Branches:
refs/heads/branch-1.1 73ac8f378 -> 54dc45b8f
Revert "HBASE-13605 RegionStates should not keep its list of dead servers (Enis Soztutar)"
This reverts commit 0706559581c388688f1b5f719bff20092632ffd1.
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/54dc45b8
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/54dc45b8
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/54dc45b8
Branch: refs/heads/branch-1.1
Commit: 54dc45b8fe114aa3e63a345717bea748c760e95a
Parents: 73ac8f3
Author: Enis Soztutar <en...@apache.org>
Authored: Thu Jun 18 11:45:37 2015 -0700
Committer: Enis Soztutar <en...@apache.org>
Committed: Thu Jun 18 11:45:37 2015 -0700
----------------------------------------------------------------------
.../hadoop/hbase/master/AssignmentManager.java | 4 +-
.../hadoop/hbase/master/RegionStates.java | 32 +++++++-
.../hadoop/hbase/master/ServerManager.java | 83 ++++++++++++++------
.../master/TestAssignmentManagerOnCluster.java | 18 ++++-
4 files changed, 104 insertions(+), 33 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hbase/blob/54dc45b8/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
index ebc3d01..cd0e9d7 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
@@ -799,7 +799,7 @@ public class AssignmentManager extends ZooKeeperListener {
// we need to reset the last region server of the region.
regionStates.setLastRegionServerOfRegion(sn, encodedName);
// Make sure we know the server is dead.
- if (!serverManager.isServerInDeadServersList(sn)) {
+ if (!serverManager.isServerDead(sn)) {
serverManager.expireServer(sn);
}
}
@@ -3079,7 +3079,7 @@ public class AssignmentManager extends ZooKeeperListener {
Set<ServerName> deadServers) throws IOException, KeeperException {
if (deadServers != null && !deadServers.isEmpty()) {
for (ServerName serverName: deadServers) {
- if (!serverManager.isServerInDeadServersList(serverName)) {
+ if (!serverManager.isServerDead(serverName)) {
serverManager.expireServer(serverName); // Let SSH do region re-assign
}
}
http://git-wip-us.apache.org/repos/asf/hbase/blob/54dc45b8/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
index 4348ad6..78097ac 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
@@ -44,6 +44,7 @@ import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableStateManager;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
+import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
import org.apache.hadoop.hbase.util.Bytes;
@@ -131,6 +132,14 @@ public class RegionStates {
new HashMap<String, ServerName>();
/**
+ * Map a host port pair string to the latest start code
+ * of a region server which is known to be dead. It is dead
+ * to us, but server manager may not know it yet.
+ */
+ private final HashMap<String, Long> deadServers =
+ new HashMap<String, Long>();
+
+ /**
* Map a dead servers to the time when log split is done.
* Since log splitting is not ordered, we have to remember
* all processed instances. The map is cleaned up based
@@ -814,6 +823,10 @@ public class RegionStates {
* If so, we should hold re-assign this region till SSH has split its wals.
* Once logs are split, the last assignment of this region will be reset,
* which means a null last assignment server is ok for re-assigning.
+ *
+ * A region server could be dead but we don't know it yet. We may
+ * think it's online falsely. Therefore if a server is online, we still
+ * need to confirm it reachable and having the expected start code.
*/
synchronized boolean wasRegionOnDeadServer(final String encodedName) {
ServerName server = lastAssignments.get(encodedName);
@@ -823,9 +836,24 @@ public class RegionStates {
synchronized boolean isServerDeadAndNotProcessed(ServerName server) {
if (server == null) return false;
if (serverManager.isServerOnline(server)) {
- if (!serverManager.isServerDead(server)) {
- return false;
+ String hostAndPort = server.getHostAndPort();
+ long startCode = server.getStartcode();
+ Long deadCode = deadServers.get(hostAndPort);
+ if (deadCode == null || startCode > deadCode.longValue()) {
+ if (serverManager.isServerReachable(server)) {
+ return false;
+ }
+ // The size of deadServers won't grow unbounded.
+ deadServers.put(hostAndPort, Long.valueOf(startCode));
}
+ // Watch out! If the server is not dead, the region could
+ // remain unassigned. That's why ServerManager#isServerReachable
+ // should use some retry.
+ //
+ // We cache this info since it is very unlikely for that
+ // instance to come back up later on. We don't want to expire
+ // the server since we prefer to let it die naturally.
+ LOG.warn("Couldn't reach online server " + server);
}
// Now, we know it's dead. Check if it's processed
return !processedServers.containsKey(server);
http://git-wip-us.apache.org/repos/asf/hbase/blob/54dc45b8/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index b4d31e8..5c8bd34 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -50,6 +50,7 @@ import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
+import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
import org.apache.hadoop.hbase.client.RetriesExhaustedException;
import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer;
import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
@@ -61,12 +62,14 @@ import org.apache.hadoop.hbase.protobuf.ResponseConverter;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionResponse;
+import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStartupRequest;
import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.RegionStoreSequenceIds;
import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.StoreSequenceId;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
+import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Triple;
import org.apache.hadoop.hbase.util.RetryCounter;
@@ -154,6 +157,8 @@ public class ServerManager {
private final long maxSkew;
private final long warningSkew;
+ private final RetryCounterFactory pingRetryCounterFactory;
+
/**
* Set of region servers which are dead but not processed immediately. If one
* server died before master enables ServerShutdownHandler, the server will be
@@ -212,6 +217,11 @@ public class ServerManager {
maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
this.connection = connect ? (ClusterConnection)ConnectionFactory.createConnection(c) : null;
+ int pingMaxAttempts = Math.max(1, master.getConfiguration().getInt(
+ "hbase.master.maximum.ping.server.attempts", 10));
+ int pingSleepInterval = Math.max(1, master.getConfiguration().getInt(
+ "hbase.master.ping.server.retry.sleep.interval", 100));
+ this.pingRetryCounterFactory = new RetryCounterFactory(pingMaxAttempts, pingSleepInterval);
}
/**
@@ -734,8 +744,8 @@ public class ServerManager {
" failed because no RPC connection found to this server");
return RegionOpeningState.FAILED_OPENING;
}
- OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server,
- region, versionOfOfflineNode, favoredNodes,
+ OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server,
+ region, versionOfOfflineNode, favoredNodes,
(RecoveryMode.LOG_REPLAY == this.services.getMasterFileSystem().getLogRecoveryMode()));
try {
OpenRegionResponse response = admin.openRegion(null, request);
@@ -832,7 +842,7 @@ public class ServerManager {
* Contacts a region server and waits up to timeout ms
* to close the region. This bypasses the active hmaster.
*/
- public static void closeRegionSilentlyAndWait(ClusterConnection connection,
+ public static void closeRegionSilentlyAndWait(ClusterConnection connection,
ServerName server, HRegionInfo region, long timeout) throws IOException, InterruptedException {
AdminService.BlockingInterface rs = connection.getAdmin(server);
try {
@@ -887,6 +897,47 @@ public class ServerManager {
ProtobufUtil.mergeRegions(admin, region_a, region_b, forcible);
}
+ /**
+ * Check if a region server is reachable and has the expected start code
+ */
+ public boolean isServerReachable(ServerName server) {
+ if (server == null) throw new NullPointerException("Passed server is null");
+
+ RetryCounter retryCounter = pingRetryCounterFactory.create();
+ while (retryCounter.shouldRetry()) {
+ synchronized (this.onlineServers) {
+ if (this.deadservers.isDeadServer(server)) {
+ return false;
+ }
+ }
+ try {
+ AdminService.BlockingInterface admin = getRsAdmin(server);
+ if (admin != null) {
+ ServerInfo info = ProtobufUtil.getServerInfo(admin);
+ return info != null && info.hasServerName()
+ && server.getStartcode() == info.getServerName().getStartCode();
+ }
+ } catch (RegionServerStoppedException | ServerNotRunningYetException e) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Couldn't reach " + server, e);
+ }
+ break;
+ } catch (IOException ioe) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Couldn't reach " + server + ", try=" + retryCounter.getAttemptTimes() + " of "
+ + retryCounter.getMaxAttempts(), ioe);
+ }
+ try {
+ retryCounter.sleepUntilNextRetry();
+ } catch(InterruptedException ie) {
+ Thread.currentThread().interrupt();
+ break;
+ }
+ }
+ }
+ return false;
+ }
+
/**
* @param sn
* @return Admin interface for the remote regionserver named <code>sn</code>
@@ -1045,30 +1096,10 @@ public class ServerManager {
* not known to be dead either. it is simply not tracked by the
* master any more, for example, a very old previous instance).
*/
- public synchronized boolean isServerInDeadServersList(ServerName serverName) {
- return (serverName == null || deadservers.isDeadServer(serverName)
- || queuedDeadServers.contains(serverName)
- || requeuedDeadServers.containsKey(serverName));
- }
-
- /**
- * Check if a server is known to be dead. A server can be online,
- * or known to be dead, or unknown to this manager (i.e, not online,
- * not known to be dead either. it is simply not tracked by the
- * master any more, for example, a very old previous instance).
- */
public synchronized boolean isServerDead(ServerName serverName) {
- if (isServerInDeadServersList(serverName)) {
- return true;
- }
-
- // we are not acquiring the lock
- ServerName onlineServer = findServerWithSameHostnamePortWithLock(serverName);
- if (onlineServer != null && serverName.getStartcode() < onlineServer.getStartcode()) {
- return true;
- }
-
- return false;
+ return serverName == null || deadservers.isDeadServer(serverName)
+ || queuedDeadServers.contains(serverName)
+ || requeuedDeadServers.containsKey(serverName);
}
public void shutdownCluster() {
http://git-wip-us.apache.org/repos/asf/hbase/blob/54dc45b8/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
index fd4a89e..e892ce7 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
@@ -1028,12 +1028,18 @@ public class TestAssignmentManagerOnCluster {
assertTrue(regionStates.isRegionOnline(hri));
assertEquals(oldServerName, regionStates.getRegionServerOfRegion(hri));
+ // Try to unassign the dead region before SSH
+ am.unassign(hri, false);
+ // The region should be moved to offline since the server is dead
+ RegionState state = regionStates.getRegionState(hri);
+ assertTrue(state.isOffline());
+
// Kill the hosting server, which doesn't have meta on it.
cluster.killRegionServer(oldServerName);
cluster.waitForRegionServerToStop(oldServerName, -1);
ServerManager serverManager = master.getServerManager();
- while (!serverManager.isServerInDeadServersList(oldServerName)
+ while (!serverManager.isServerDead(oldServerName)
|| serverManager.getDeadServers().areDeadServersInProgress()) {
Thread.sleep(100);
}
@@ -1087,7 +1093,7 @@ public class TestAssignmentManagerOnCluster {
TEST_UTIL.waitFor(120000, 1000, new Waiter.Predicate<Exception>() {
@Override
public boolean evaluate() throws Exception {
- return serverManager.isServerInDeadServersList(serverName)
+ return serverManager.isServerDead(serverName)
&& !serverManager.areDeadServersInProgress();
}
});
@@ -1153,6 +1159,12 @@ public class TestAssignmentManagerOnCluster {
assertTrue(regionStates.isRegionOnline(hri));
assertEquals(oldServerName, regionStates.getRegionServerOfRegion(hri));
+ // Try to unassign the dead region before SSH
+ am.unassign(hri, false);
+ // The region should be moved to offline since the server is dead
+ RegionState state = regionStates.getRegionState(hri);
+ assertTrue(state.isOffline());
+
// Disable the table now.
master.disableTable(hri.getTable());
@@ -1161,7 +1173,7 @@ public class TestAssignmentManagerOnCluster {
cluster.waitForRegionServerToStop(oldServerName, -1);
ServerManager serverManager = master.getServerManager();
- while (!serverManager.isServerInDeadServersList(oldServerName)
+ while (!serverManager.isServerDead(oldServerName)
|| serverManager.getDeadServers().areDeadServersInProgress()) {
Thread.sleep(100);
}