You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by nd...@apache.org on 2015/06/18 00:37:51 UTC
hbase git commit: HBASE-13605 RegionStates should not keep its list
of dead servers (Enis Soztutar)
Repository: hbase
Updated Branches:
refs/heads/branch-1.1 635dae324 -> 070655958
HBASE-13605 RegionStates should not keep its list of dead servers (Enis Soztutar)
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/07065595
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/07065595
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/07065595
Branch: refs/heads/branch-1.1
Commit: 0706559581c388688f1b5f719bff20092632ffd1
Parents: 635dae3
Author: Nick Dimiduk <nd...@apache.org>
Authored: Wed Jun 17 15:35:44 2015 -0700
Committer: Nick Dimiduk <nd...@apache.org>
Committed: Wed Jun 17 15:35:44 2015 -0700
----------------------------------------------------------------------
.../hadoop/hbase/master/AssignmentManager.java | 4 +-
.../hadoop/hbase/master/RegionStates.java | 32 +-------
.../hadoop/hbase/master/ServerManager.java | 83 ++++++--------------
.../master/TestAssignmentManagerOnCluster.java | 18 +----
4 files changed, 33 insertions(+), 104 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hbase/blob/07065595/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
index cd0e9d7..ebc3d01 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
@@ -799,7 +799,7 @@ public class AssignmentManager extends ZooKeeperListener {
// we need to reset the last region server of the region.
regionStates.setLastRegionServerOfRegion(sn, encodedName);
// Make sure we know the server is dead.
- if (!serverManager.isServerDead(sn)) {
+ if (!serverManager.isServerInDeadServersList(sn)) {
serverManager.expireServer(sn);
}
}
@@ -3079,7 +3079,7 @@ public class AssignmentManager extends ZooKeeperListener {
Set<ServerName> deadServers) throws IOException, KeeperException {
if (deadServers != null && !deadServers.isEmpty()) {
for (ServerName serverName: deadServers) {
- if (!serverManager.isServerDead(serverName)) {
+ if (!serverManager.isServerInDeadServersList(serverName)) {
serverManager.expireServer(serverName); // Let SSH do region re-assign
}
}
http://git-wip-us.apache.org/repos/asf/hbase/blob/07065595/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
index 78097ac..4348ad6 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
@@ -44,7 +44,6 @@ import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableStateManager;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
-import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
import org.apache.hadoop.hbase.util.Bytes;
@@ -132,14 +131,6 @@ public class RegionStates {
new HashMap<String, ServerName>();
/**
- * Map a host port pair string to the latest start code
- * of a region server which is known to be dead. It is dead
- * to us, but server manager may not know it yet.
- */
- private final HashMap<String, Long> deadServers =
- new HashMap<String, Long>();
-
- /**
* Map a dead servers to the time when log split is done.
* Since log splitting is not ordered, we have to remember
* all processed instances. The map is cleaned up based
@@ -823,10 +814,6 @@ public class RegionStates {
* If so, we should hold re-assign this region till SSH has split its wals.
* Once logs are split, the last assignment of this region will be reset,
* which means a null last assignment server is ok for re-assigning.
- *
- * A region server could be dead but we don't know it yet. We may
- * think it's online falsely. Therefore if a server is online, we still
- * need to confirm it reachable and having the expected start code.
*/
synchronized boolean wasRegionOnDeadServer(final String encodedName) {
ServerName server = lastAssignments.get(encodedName);
@@ -836,24 +823,9 @@ public class RegionStates {
synchronized boolean isServerDeadAndNotProcessed(ServerName server) {
if (server == null) return false;
if (serverManager.isServerOnline(server)) {
- String hostAndPort = server.getHostAndPort();
- long startCode = server.getStartcode();
- Long deadCode = deadServers.get(hostAndPort);
- if (deadCode == null || startCode > deadCode.longValue()) {
- if (serverManager.isServerReachable(server)) {
- return false;
- }
- // The size of deadServers won't grow unbounded.
- deadServers.put(hostAndPort, Long.valueOf(startCode));
+ if (!serverManager.isServerDead(server)) {
+ return false;
}
- // Watch out! If the server is not dead, the region could
- // remain unassigned. That's why ServerManager#isServerReachable
- // should use some retry.
- //
- // We cache this info since it is very unlikely for that
- // instance to come back up later on. We don't want to expire
- // the server since we prefer to let it die naturally.
- LOG.warn("Couldn't reach online server " + server);
}
// Now, we know it's dead. Check if it's processed
return !processedServers.containsKey(server);
http://git-wip-us.apache.org/repos/asf/hbase/blob/07065595/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index 5c8bd34..b4d31e8 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -50,7 +50,6 @@ import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
-import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
import org.apache.hadoop.hbase.client.RetriesExhaustedException;
import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer;
import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
@@ -62,14 +61,12 @@ import org.apache.hadoop.hbase.protobuf.ResponseConverter;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionResponse;
-import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStartupRequest;
import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.RegionStoreSequenceIds;
import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.StoreSequenceId;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
-import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Triple;
import org.apache.hadoop.hbase.util.RetryCounter;
@@ -157,8 +154,6 @@ public class ServerManager {
private final long maxSkew;
private final long warningSkew;
- private final RetryCounterFactory pingRetryCounterFactory;
-
/**
* Set of region servers which are dead but not processed immediately. If one
* server died before master enables ServerShutdownHandler, the server will be
@@ -217,11 +212,6 @@ public class ServerManager {
maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
this.connection = connect ? (ClusterConnection)ConnectionFactory.createConnection(c) : null;
- int pingMaxAttempts = Math.max(1, master.getConfiguration().getInt(
- "hbase.master.maximum.ping.server.attempts", 10));
- int pingSleepInterval = Math.max(1, master.getConfiguration().getInt(
- "hbase.master.ping.server.retry.sleep.interval", 100));
- this.pingRetryCounterFactory = new RetryCounterFactory(pingMaxAttempts, pingSleepInterval);
}
/**
@@ -744,8 +734,8 @@ public class ServerManager {
" failed because no RPC connection found to this server");
return RegionOpeningState.FAILED_OPENING;
}
- OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server,
- region, versionOfOfflineNode, favoredNodes,
+ OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server,
+ region, versionOfOfflineNode, favoredNodes,
(RecoveryMode.LOG_REPLAY == this.services.getMasterFileSystem().getLogRecoveryMode()));
try {
OpenRegionResponse response = admin.openRegion(null, request);
@@ -842,7 +832,7 @@ public class ServerManager {
* Contacts a region server and waits up to timeout ms
* to close the region. This bypasses the active hmaster.
*/
- public static void closeRegionSilentlyAndWait(ClusterConnection connection,
+ public static void closeRegionSilentlyAndWait(ClusterConnection connection,
ServerName server, HRegionInfo region, long timeout) throws IOException, InterruptedException {
AdminService.BlockingInterface rs = connection.getAdmin(server);
try {
@@ -897,47 +887,6 @@ public class ServerManager {
ProtobufUtil.mergeRegions(admin, region_a, region_b, forcible);
}
- /**
- * Check if a region server is reachable and has the expected start code
- */
- public boolean isServerReachable(ServerName server) {
- if (server == null) throw new NullPointerException("Passed server is null");
-
- RetryCounter retryCounter = pingRetryCounterFactory.create();
- while (retryCounter.shouldRetry()) {
- synchronized (this.onlineServers) {
- if (this.deadservers.isDeadServer(server)) {
- return false;
- }
- }
- try {
- AdminService.BlockingInterface admin = getRsAdmin(server);
- if (admin != null) {
- ServerInfo info = ProtobufUtil.getServerInfo(admin);
- return info != null && info.hasServerName()
- && server.getStartcode() == info.getServerName().getStartCode();
- }
- } catch (RegionServerStoppedException | ServerNotRunningYetException e) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Couldn't reach " + server, e);
- }
- break;
- } catch (IOException ioe) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Couldn't reach " + server + ", try=" + retryCounter.getAttemptTimes() + " of "
- + retryCounter.getMaxAttempts(), ioe);
- }
- try {
- retryCounter.sleepUntilNextRetry();
- } catch(InterruptedException ie) {
- Thread.currentThread().interrupt();
- break;
- }
- }
- }
- return false;
- }
-
/**
* @param sn
* @return Admin interface for the remote regionserver named <code>sn</code>
@@ -1096,10 +1045,30 @@ public class ServerManager {
* not known to be dead either. it is simply not tracked by the
* master any more, for example, a very old previous instance).
*/
+ public synchronized boolean isServerInDeadServersList(ServerName serverName) {
+ return (serverName == null || deadservers.isDeadServer(serverName)
+ || queuedDeadServers.contains(serverName)
+ || requeuedDeadServers.containsKey(serverName));
+ }
+
+ /**
+ * Check if a server is known to be dead. A server can be online,
+ * or known to be dead, or unknown to this manager (i.e, not online,
+ * not known to be dead either. it is simply not tracked by the
+ * master any more, for example, a very old previous instance).
+ */
public synchronized boolean isServerDead(ServerName serverName) {
- return serverName == null || deadservers.isDeadServer(serverName)
- || queuedDeadServers.contains(serverName)
- || requeuedDeadServers.containsKey(serverName);
+ if (isServerInDeadServersList(serverName)) {
+ return true;
+ }
+
+ // we are not acquiring the lock
+ ServerName onlineServer = findServerWithSameHostnamePortWithLock(serverName);
+ if (onlineServer != null && serverName.getStartcode() < onlineServer.getStartcode()) {
+ return true;
+ }
+
+ return false;
}
public void shutdownCluster() {
http://git-wip-us.apache.org/repos/asf/hbase/blob/07065595/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
index e892ce7..fd4a89e 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
@@ -1028,18 +1028,12 @@ public class TestAssignmentManagerOnCluster {
assertTrue(regionStates.isRegionOnline(hri));
assertEquals(oldServerName, regionStates.getRegionServerOfRegion(hri));
- // Try to unassign the dead region before SSH
- am.unassign(hri, false);
- // The region should be moved to offline since the server is dead
- RegionState state = regionStates.getRegionState(hri);
- assertTrue(state.isOffline());
-
// Kill the hosting server, which doesn't have meta on it.
cluster.killRegionServer(oldServerName);
cluster.waitForRegionServerToStop(oldServerName, -1);
ServerManager serverManager = master.getServerManager();
- while (!serverManager.isServerDead(oldServerName)
+ while (!serverManager.isServerInDeadServersList(oldServerName)
|| serverManager.getDeadServers().areDeadServersInProgress()) {
Thread.sleep(100);
}
@@ -1093,7 +1087,7 @@ public class TestAssignmentManagerOnCluster {
TEST_UTIL.waitFor(120000, 1000, new Waiter.Predicate<Exception>() {
@Override
public boolean evaluate() throws Exception {
- return serverManager.isServerDead(serverName)
+ return serverManager.isServerInDeadServersList(serverName)
&& !serverManager.areDeadServersInProgress();
}
});
@@ -1159,12 +1153,6 @@ public class TestAssignmentManagerOnCluster {
assertTrue(regionStates.isRegionOnline(hri));
assertEquals(oldServerName, regionStates.getRegionServerOfRegion(hri));
- // Try to unassign the dead region before SSH
- am.unassign(hri, false);
- // The region should be moved to offline since the server is dead
- RegionState state = regionStates.getRegionState(hri);
- assertTrue(state.isOffline());
-
// Disable the table now.
master.disableTable(hri.getTable());
@@ -1173,7 +1161,7 @@ public class TestAssignmentManagerOnCluster {
cluster.waitForRegionServerToStop(oldServerName, -1);
ServerManager serverManager = master.getServerManager();
- while (!serverManager.isServerDead(oldServerName)
+ while (!serverManager.isServerInDeadServersList(oldServerName)
|| serverManager.getDeadServers().areDeadServersInProgress()) {
Thread.sleep(100);
}