You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by bh...@apache.org on 2021/03/26 00:12:00 UTC
[hbase] branch branch-2 updated: HBASE-25032 Wait for region server
to become online before adding it to online servers in Master (#2771)
This is an automated email from the ASF dual-hosted git repository.
bharathv pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2 by this push:
new 3bb9788 HBASE-25032 Wait for region server to become online before adding it to online servers in Master (#2771)
3bb9788 is described below
commit 3bb978894db2c7ce41010e403a83a8a9f5d1110e
Author: caroliney14 <ca...@berkeley.edu>
AuthorDate: Thu Mar 25 17:10:43 2021 -0700
HBASE-25032 Wait for region server to become online before adding it to online servers in Master (#2771)
Signed-off-by: Bharath Vissapragada <bh...@apache.org>
Signed-off-by: Michael Stack <st...@apache.org>
---
.../apache/hadoop/hbase/master/ServerManager.java | 8 ++---
.../hadoop/hbase/regionserver/HRegionServer.java | 34 ++++++++++++----------
.../org/apache/hadoop/hbase/MiniHBaseCluster.java | 3 +-
.../hbase/master/TestGetReplicationLoad.java | 3 +-
.../hadoop/hbase/master/TestMasterMetrics.java | 6 ----
.../TestCompactionInDeadRegionServer.java | 3 +-
.../regionserver/TestShutdownWhileWALBroken.java | 3 +-
7 files changed, 28 insertions(+), 32 deletions(-)
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index 6b9a0a5..064c2e6 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -179,7 +179,8 @@ public class ServerManager {
}
/**
- * Let the server manager know a new regionserver has come online
+ * Let the server manager know a regionserver is requesting configurations.
+ * Regionserver will not be added here, but in its first report.
* @param request the startup request
* @param versionNumber the version number of the new regionserver
* @param version the version of the new regionserver, could contain strings like "SNAPSHOT"
@@ -202,10 +203,6 @@ public class ServerManager {
ServerName sn = ServerName.valueOf(hostname, request.getPort(), request.getServerStartCode());
checkClockSkew(sn, request.getServerCurrentTime());
checkIsDead(sn, "STARTUP");
- if (!checkAndRecordNewServer(sn, ServerMetricsBuilder.of(sn, versionNumber, version))) {
- LOG.warn(
- "THIS SHOULD NOT HAPPEN, RegionServerStartup" + " could not record the server: " + sn);
- }
return sn;
}
@@ -256,7 +253,6 @@ public class ServerManager {
if (null == this.onlineServers.replace(sn, sl)) {
// Already have this host+port combo and its just different start code?
// Just let the server in. Presume master joining a running cluster.
- // recordNewServer is what happens at the end of reportServerStartup.
// The only thing we are skipping is passing back to the regionserver
// the ServerName to use. Here we presume a master has already done
// that so we'll press on with whatever it gave us for ServerName.
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index a8931de..d6981e2 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -1026,10 +1026,9 @@ public class HRegionServer extends Thread implements
// node was created, in case any coprocessors want to use ZooKeeper
this.rsHost = new RegionServerCoprocessorHost(this, this.conf);
- // Try and register with the Master; tell it we are here. Break if server is stopped or
- // the clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and
- // start up all Services. Use RetryCounter to get backoff in case Master is struggling to
- // come up.
+ // Get configurations from the Master. Break if server is stopped or
+ // the clusterup flag is down or hdfs went wacky. Then start up all Services.
+ // Use RetryCounter to get backoff in case Master is struggling to come up.
LOG.debug("About to register with Master.");
RetryCounterFactory rcf =
new RetryCounterFactory(Integer.MAX_VALUE, this.sleeper.getPeriod(), 1000 * 60 * 5);
@@ -1062,7 +1061,7 @@ public class HRegionServer extends Thread implements
}
}
- // We registered with the Master. Go into run mode.
+ // Run mode.
long lastMsg = System.currentTimeMillis();
long oldRequestCount = -1;
// The main run loop.
@@ -1096,7 +1095,14 @@ public class HRegionServer extends Thread implements
}
long now = System.currentTimeMillis();
if ((now - lastMsg) >= msgInterval) {
- tryRegionServerReport(lastMsg, now);
+ // Register with the Master now that our setup is complete.
+ if (tryRegionServerReport(lastMsg, now) && !online.get()) {
+ // Wake up anyone waiting for this server to online
+ synchronized (online) {
+ online.set(true);
+ online.notifyAll();
+ }
+ }
lastMsg = System.currentTimeMillis();
}
if (!isStopped() && !isAborted()) {
@@ -1266,12 +1272,12 @@ public class HRegionServer extends Thread implements
}
@InterfaceAudience.Private
- protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
+ protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
throws IOException {
RegionServerStatusService.BlockingInterface rss = rssStub;
if (rss == null) {
// the current server could be stopping.
- return;
+ return false;
}
ClusterStatusProtos.ServerLoad sl = buildServerLoad(reportStartTime, reportEndTime);
try {
@@ -1291,7 +1297,9 @@ public class HRegionServer extends Thread implements
// Couldn't connect to the master, get location from zk and reconnect
// Method blocks until new master is found or we are stopped
createRegionServerStatusStub(true);
+ return false;
}
+ return true;
}
/**
@@ -1654,11 +1662,6 @@ public class HRegionServer extends Thread implements
", sessionid=0x" +
Long.toHexString(this.zooKeeper.getRecoverableZooKeeper().getSessionId()));
- // Wake up anyone waiting for this server to online
- synchronized (online) {
- online.set(true);
- online.notifyAll();
- }
} catch (Throwable e) {
stop("Failed initialization");
throw convertThrowableToIOE(cleanup(e, "Failed init"),
@@ -2822,10 +2825,9 @@ public class HRegionServer extends Thread implements
}
/*
- * Let the master know we're here Run initialization using parameters passed
- * us by the master.
+ * Run initialization using parameters passed us by the master.
* @return A Map of key/value configurations we got from the Master else
- * null if we failed to register.
+ * null if we failed during report.
* @throws IOException
*/
private RegionServerStartupResponse reportForDuty() throws IOException {
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
index f795eef..89d1968 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
@@ -449,8 +449,9 @@ public class MiniHBaseCluster extends HBaseCluster {
ServerName rsServerName = t.getRegionServer().getServerName();
long start = System.currentTimeMillis();
- ClusterStatus clusterStatus = getClusterStatus();
+ ClusterStatus clusterStatus;
while ((System.currentTimeMillis() - start) < timeout) {
+ clusterStatus = getClusterStatus();
if (clusterStatus != null && clusterStatus.getServers().contains(rsServerName)) {
return t;
}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestGetReplicationLoad.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestGetReplicationLoad.java
index ac9ae39..1fcd646 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestGetReplicationLoad.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestGetReplicationLoad.java
@@ -58,8 +58,9 @@ public class TestGetReplicationLoad {
}
@Override
- protected void tryRegionServerReport(long reportStartTime, long reportEndTime) {
+ protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime) {
// do nothing
+ return true;
}
}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterMetrics.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterMetrics.java
index 5116933..4e9bf10 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterMetrics.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterMetrics.java
@@ -60,12 +60,6 @@ public class TestMasterMetrics {
public MyMaster(Configuration conf) throws IOException, KeeperException, InterruptedException {
super(conf);
}
- @Override
- protected void tryRegionServerReport(
- long reportStartTime, long reportEndTime) {
- // do nothing
- }
-
}
@BeforeClass
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactionInDeadRegionServer.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactionInDeadRegionServer.java
index 78042cc..9b5c523 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactionInDeadRegionServer.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactionInDeadRegionServer.java
@@ -84,13 +84,14 @@ public class TestCompactionInDeadRegionServer {
}
@Override
- protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
+ protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
throws IOException {
try {
super.tryRegionServerReport(reportStartTime, reportEndTime);
} catch (YouAreDeadException e) {
// ignore, do not abort
}
+ return true;
}
}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestShutdownWhileWALBroken.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestShutdownWhileWALBroken.java
index 30c954b..ffc3a19 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestShutdownWhileWALBroken.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestShutdownWhileWALBroken.java
@@ -85,13 +85,14 @@ public class TestShutdownWhileWALBroken {
}
@Override
- protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
+ protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
throws IOException {
try {
super.tryRegionServerReport(reportStartTime, reportEndTime);
} catch (YouAreDeadException e) {
LOG.info("Caught YouAreDeadException, ignore", e);
}
+ return true;
}
@Override