You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by bh...@apache.org on 2020/03/31 23:36:40 UTC

[hbase] branch master updated: HBASE-24075: Fix a race between master shutdown and metrics (re)init

This is an automated email from the ASF dual-hosted git repository.

bharathv pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/master by this push:
     new 6f213e9  HBASE-24075: Fix a race between master shutdown and metrics (re)init
6f213e9 is described below

commit 6f213e9d5a15afcc40b6562ab4898f913cec91eb
Author: Bharath Vissapragada <bh...@apache.org>
AuthorDate: Tue Mar 31 00:16:15 2020 -0700

    HBASE-24075: Fix a race between master shutdown and metrics (re)init
    
    JMXCacheBuster resets the metrics state at various points in time. These
    events can potentially race with a master shutdown. When the master is
    tearing down, metrics initialization can touch a lot of unsafe state,
    for example invalidated FS objects. To avoid this, this patch makes
    the getMetrics() a no-op when the master is either stopped or in the
    process of shutting down. Additionally, getClusterId() when the server
    is shutting down is made a no-op.
    
    Simulating a test for this is a bit tricky but with the patch I don't
    locally see the long stacktraces from the jira.
    
    Signed-off-by: Michael Stack <st...@apache.org>
---
 .../hadoop/hbase/master/MetricsMasterWrapper.java     |  5 +++++
 .../hadoop/hbase/master/MetricsMasterSourceImpl.java  |  4 +++-
 .../apache/hadoop/hbase/master/CachedClusterId.java   | 19 +++++++++++++------
 .../java/org/apache/hadoop/hbase/master/HMaster.java  |  2 +-
 .../hadoop/hbase/master/MetricsMasterWrapperImpl.java |  4 ++++
 .../org/apache/hadoop/hbase/TestCachedClusterId.java  |  3 ++-
 6 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapper.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapper.java
index 0c5ab59..77b357f 100644
--- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapper.java
+++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapper.java
@@ -31,6 +31,11 @@ import org.apache.yetus.audience.InterfaceAudience;
 public interface MetricsMasterWrapper {
 
   /**
+   * Returns if the master is currently running and is not attempting to shutdown.
+   */
+  boolean isRunning();
+
+  /**
    * Get ServerName
    */
   String getServerName();
diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSourceImpl.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSourceImpl.java
index fc49a40..8163050 100644
--- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSourceImpl.java
+++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSourceImpl.java
@@ -83,7 +83,9 @@ public class MetricsMasterSourceImpl
     MetricsRecordBuilder metricsRecordBuilder = metricsCollector.addRecord(metricsName);
 
     // masterWrapper can be null because this function is called inside of init.
-    if (masterWrapper != null) {
+    // If the master is already stopped or has initiated a shutdown, no point in registering the
+    // metrics again.
+    if (masterWrapper != null && masterWrapper.isRunning()) {
 
       // Pair<online region number, offline region number>
       PairOfSameType<Integer> regionNumberPair = masterWrapper.getRegionCounts();
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CachedClusterId.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CachedClusterId.java
index 9ca7399..4bc444b 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CachedClusterId.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CachedClusterId.java
@@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.ClusterId;
+import org.apache.hadoop.hbase.Server;
 import org.apache.hadoop.hbase.util.FSUtils;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
@@ -46,8 +47,8 @@ public class CachedClusterId {
   public static final Logger LOG = LoggerFactory.getLogger(CachedClusterId.class);
   private static final int MAX_FETCH_TIMEOUT_MS = 10000;
 
-  private Path rootDir;
-  private FileSystem fs;
+  private final Path rootDir;
+  private final FileSystem fs;
 
   // When true, indicates that a FileSystem fetch of ClusterID is in progress. This is used to
   // avoid multiple fetches from FS and let only one thread fetch the information.
@@ -58,12 +59,15 @@ public class CachedClusterId {
   // Immutable once set and read multiple times.
   private ClusterId clusterId;
 
+  private final Server server;
+
   // cache stats for testing.
   private AtomicInteger cacheMisses = new AtomicInteger(0);
 
-  public CachedClusterId(Configuration conf) throws IOException {
-    rootDir = FSUtils.getRootDir(conf);
-    fs = rootDir.getFileSystem(conf);
+  public CachedClusterId(Server server, Configuration conf) throws IOException {
+    this.rootDir = FSUtils.getRootDir(conf);
+    this.fs = rootDir.getFileSystem(conf);
+    this.server = server;
   }
 
   /**
@@ -130,9 +134,12 @@ public class CachedClusterId {
    * trying get from a clean cache.
    *
    * @return ClusterId by reading from FileSystem or null in any error case or cluster ID does
-   *     not exist on the file system.
+   *     not exist on the file system or if the server initiated a tear down.
    */
   public String getFromCacheOrFetch() {
+    if (server.isStopping() || server.isStopped()) {
+      return null;
+    }
     String id = getClusterId();
     if (id != null) {
       return id;
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index f244e76..7ed5ca6 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -586,7 +586,7 @@ public class HMaster extends HRegionServer implements MasterServices {
         this.metaRegionLocationCache = null;
         this.activeMasterManager = null;
       }
-      cachedClusterId = new CachedClusterId(conf);
+      cachedClusterId = new CachedClusterId(this, conf);
     } catch (Throwable t) {
       // Make sure we log the exception. HMaster is often started via reflection and the
       // cause of failed startup is lost.
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapperImpl.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapperImpl.java
index 0f30ceb..caf4324 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapperImpl.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapperImpl.java
@@ -134,6 +134,10 @@ public class MetricsMasterWrapperImpl implements MetricsMasterWrapper {
     return serverManager.getDeadServers().size();
   }
 
+  @Override public boolean isRunning() {
+    return !(master.isStopped() || master.isStopping());
+  }
+
   @Override
   public String getServerName() {
     ServerName serverName = master.getServerName();
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestCachedClusterId.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestCachedClusterId.java
index b8a4be8..c146685 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestCachedClusterId.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestCachedClusterId.java
@@ -76,7 +76,8 @@ public class TestCachedClusterId {
   @Test
   public void testMultiThreadedGetClusterId() throws Exception {
     Configuration conf = TEST_UTIL.getConfiguration();
-    CachedClusterId cachedClusterId = new CachedClusterId(conf);
+    CachedClusterId cachedClusterId = new CachedClusterId(TEST_UTIL.getHBaseCluster().getMaster(),
+      conf);
     TestContext context = new TestContext(conf);
     int numThreads = 16;
     for (int i = 0; i < numThreads; i++) {