You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by bh...@apache.org on 2021/03/26 04:41:19 UTC

[hbase] branch branch-2.3 updated: HBASE-25032 Wait for region server to become online before adding it to online servers in Master (#2771) (#3095)

This is an automated email from the ASF dual-hosted git repository.

bharathv pushed a commit to branch branch-2.3
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2.3 by this push:
     new 4756678  HBASE-25032 Wait for region server to become online before adding it to online servers in Master (#2771) (#3095)
4756678 is described below

commit 475667837b6b04e4c0315575213160fe16a08e11
Author: Bharath Vissapragada <bh...@apache.org>
AuthorDate: Thu Mar 25 21:40:50 2021 -0700

    HBASE-25032 Wait for region server to become online before adding it to online servers in Master (#2771) (#3095)
    
    Co-authored-by: caroliney14 <ca...@berkeley.edu>
    
    Signed-off-by: Bharath Vissapragada <bh...@apache.org>
    Signed-off-by: Michael Stack <st...@apache.org>
    (cherry picked from commit 3bb978894db2c7ce41010e403a83a8a9f5d1110e)
---
 .../apache/hadoop/hbase/master/ServerManager.java  |  8 ++---
 .../hadoop/hbase/regionserver/HRegionServer.java   | 34 ++++++++++++----------
 .../org/apache/hadoop/hbase/MiniHBaseCluster.java  |  3 +-
 .../hbase/master/TestGetReplicationLoad.java       |  3 +-
 .../hadoop/hbase/master/TestMasterMetrics.java     |  6 ----
 .../TestCompactionInDeadRegionServer.java          |  3 +-
 .../regionserver/TestShutdownWhileWALBroken.java   |  3 +-
 7 files changed, 28 insertions(+), 32 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index 9dc3957..ef967fc 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -178,7 +178,8 @@ public class ServerManager {
   }
 
   /**
-   * Let the server manager know a new regionserver has come online
+   * Let the server manager know a regionserver is requesting configurations.
+   * Regionserver will not be added here, but in its first report.
    * @param request the startup request
    * @param versionNumber the version number of the new regionserver
    * @param version the version of the new regionserver, could contain strings like "SNAPSHOT"
@@ -201,10 +202,6 @@ public class ServerManager {
     ServerName sn = ServerName.valueOf(hostname, request.getPort(), request.getServerStartCode());
     checkClockSkew(sn, request.getServerCurrentTime());
     checkIsDead(sn, "STARTUP");
-    if (!checkAndRecordNewServer(sn, ServerMetricsBuilder.of(sn, versionNumber, version))) {
-      LOG.warn(
-        "THIS SHOULD NOT HAPPEN, RegionServerStartup" + " could not record the server: " + sn);
-    }
     return sn;
   }
 
@@ -256,7 +253,6 @@ public class ServerManager {
     if (null == this.onlineServers.replace(sn, sl)) {
       // Already have this host+port combo and its just different start code?
       // Just let the server in. Presume master joining a running cluster.
-      // recordNewServer is what happens at the end of reportServerStartup.
       // The only thing we are skipping is passing back to the regionserver
       // the ServerName to use. Here we presume a master has already done
       // that so we'll press on with whatever it gave us for ServerName.
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 0560c25..a82a56c 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -975,10 +975,9 @@ public class HRegionServer extends Thread implements
         // node was created, in case any coprocessors want to use ZooKeeper
         this.rsHost = new RegionServerCoprocessorHost(this, this.conf);
 
-        // Try and register with the Master; tell it we are here.  Break if server is stopped or
-        // the clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and
-        // start up all Services. Use RetryCounter to get backoff in case Master is struggling to
-        // come up.
+        // Get configurations from the Master. Break if server is stopped or
+        // the clusterup flag is down or hdfs went wacky. Then start up all Services.
+        // Use RetryCounter to get backoff in case Master is struggling to come up.
         LOG.debug("About to register with Master.");
         RetryCounterFactory rcf =
           new RetryCounterFactory(Integer.MAX_VALUE, this.sleeper.getPeriod(), 1000 * 60 * 5);
@@ -1011,7 +1010,7 @@ public class HRegionServer extends Thread implements
         }
       }
 
-      // We registered with the Master.  Go into run mode.
+      // Run mode.
       long lastMsg = System.currentTimeMillis();
       long oldRequestCount = -1;
       // The main run loop.
@@ -1045,7 +1044,14 @@ public class HRegionServer extends Thread implements
         }
         long now = System.currentTimeMillis();
         if ((now - lastMsg) >= msgInterval) {
-          tryRegionServerReport(lastMsg, now);
+          // Register with the Master now that our setup is complete.
+          if (tryRegionServerReport(lastMsg, now) && !online.get()) {
+            // Wake up anyone waiting for this server to online
+            synchronized (online) {
+              online.set(true);
+              online.notifyAll();
+            }
+          }
           lastMsg = System.currentTimeMillis();
         }
         if (!isStopped() && !isAborted()) {
@@ -1215,12 +1221,12 @@ public class HRegionServer extends Thread implements
   }
 
   @VisibleForTesting
-  protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
+  protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
       throws IOException {
     RegionServerStatusService.BlockingInterface rss = rssStub;
     if (rss == null) {
       // the current server could be stopping.
-      return;
+      return false;
     }
     ClusterStatusProtos.ServerLoad sl = buildServerLoad(reportStartTime, reportEndTime);
     try {
@@ -1240,7 +1246,9 @@ public class HRegionServer extends Thread implements
       // Couldn't connect to the master, get location from zk and reconnect
       // Method blocks until new master is found or we are stopped
       createRegionServerStatusStub(true);
+      return false;
     }
+    return true;
   }
 
   /**
@@ -1603,11 +1611,6 @@ public class HRegionServer extends Thread implements
           ", sessionid=0x" +
           Long.toHexString(this.zooKeeper.getRecoverableZooKeeper().getSessionId()));
 
-      // Wake up anyone waiting for this server to online
-      synchronized (online) {
-        online.set(true);
-        online.notifyAll();
-      }
     } catch (Throwable e) {
       stop("Failed initialization");
       throw convertThrowableToIOE(cleanup(e, "Failed init"),
@@ -2726,10 +2729,9 @@ public class HRegionServer extends Thread implements
   }
 
   /*
-   * Let the master know we're here Run initialization using parameters passed
-   * us by the master.
+   * Run initialization using parameters passed us by the master.
    * @return A Map of key/value configurations we got from the Master else
-   * null if we failed to register.
+   * null if we failed during report.
    * @throws IOException
    */
   private RegionServerStartupResponse reportForDuty() throws IOException {
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
index f795eef..89d1968 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
@@ -449,8 +449,9 @@ public class MiniHBaseCluster extends HBaseCluster {
     ServerName rsServerName = t.getRegionServer().getServerName();
 
     long start = System.currentTimeMillis();
-    ClusterStatus clusterStatus = getClusterStatus();
+    ClusterStatus clusterStatus;
     while ((System.currentTimeMillis() - start) < timeout) {
+      clusterStatus = getClusterStatus();
       if (clusterStatus != null && clusterStatus.getServers().contains(rsServerName)) {
         return t;
       }
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestGetReplicationLoad.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestGetReplicationLoad.java
index ac9ae39..1fcd646 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestGetReplicationLoad.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestGetReplicationLoad.java
@@ -58,8 +58,9 @@ public class TestGetReplicationLoad {
     }
 
     @Override
-    protected void tryRegionServerReport(long reportStartTime, long reportEndTime) {
+    protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime) {
       // do nothing
+      return true;
     }
   }
 
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterMetrics.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterMetrics.java
index 5116933..4e9bf10 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterMetrics.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterMetrics.java
@@ -60,12 +60,6 @@ public class TestMasterMetrics {
     public MyMaster(Configuration conf) throws IOException, KeeperException, InterruptedException {
       super(conf);
     }
-    @Override
-    protected void tryRegionServerReport(
-        long reportStartTime, long reportEndTime) {
-      // do nothing
-    }
-
   }
 
   @BeforeClass
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactionInDeadRegionServer.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactionInDeadRegionServer.java
index 78042cc..9b5c523 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactionInDeadRegionServer.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactionInDeadRegionServer.java
@@ -84,13 +84,14 @@ public class TestCompactionInDeadRegionServer {
     }
 
     @Override
-    protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
+    protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
         throws IOException {
       try {
         super.tryRegionServerReport(reportStartTime, reportEndTime);
       } catch (YouAreDeadException e) {
         // ignore, do not abort
       }
+      return true;
     }
   }
 
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestShutdownWhileWALBroken.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestShutdownWhileWALBroken.java
index 30c954b..ffc3a19 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestShutdownWhileWALBroken.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestShutdownWhileWALBroken.java
@@ -85,13 +85,14 @@ public class TestShutdownWhileWALBroken {
     }
 
     @Override
-    protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
+    protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
         throws IOException {
       try {
         super.tryRegionServerReport(reportStartTime, reportEndTime);
       } catch (YouAreDeadException e) {
         LOG.info("Caught YouAreDeadException, ignore", e);
       }
+      return true;
     }
 
     @Override