You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by bh...@apache.org on 2020/03/31 23:36:40 UTC
[hbase] branch master updated: HBASE-24075: Fix a race between
master shutdown and metrics (re)init
This is an automated email from the ASF dual-hosted git repository.
bharathv pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/master by this push:
new 6f213e9 HBASE-24075: Fix a race between master shutdown and metrics (re)init
6f213e9 is described below
commit 6f213e9d5a15afcc40b6562ab4898f913cec91eb
Author: Bharath Vissapragada <bh...@apache.org>
AuthorDate: Tue Mar 31 00:16:15 2020 -0700
HBASE-24075: Fix a race between master shutdown and metrics (re)init
JMXCacheBuster resets the metrics state at various points in time. These
events can potentially race with a master shutdown. When the master is
tearing down, metrics initialization can touch a lot of unsafe state,
for example invalidated FS objects. To avoid this, this patch makes
the getMetrics() a no-op when the master is either stopped or in the
process of shutting down. Additionally, getClusterId() when the server
is shutting down is made a no-op.
Simulating a test for this is a bit tricky but with the patch I don't
locally see the long stacktraces from the jira.
Signed-off-by: Michael Stack <st...@apache.org>
---
.../hadoop/hbase/master/MetricsMasterWrapper.java | 5 +++++
.../hadoop/hbase/master/MetricsMasterSourceImpl.java | 4 +++-
.../apache/hadoop/hbase/master/CachedClusterId.java | 19 +++++++++++++------
.../java/org/apache/hadoop/hbase/master/HMaster.java | 2 +-
.../hadoop/hbase/master/MetricsMasterWrapperImpl.java | 4 ++++
.../org/apache/hadoop/hbase/TestCachedClusterId.java | 3 ++-
6 files changed, 28 insertions(+), 9 deletions(-)
diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapper.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapper.java
index 0c5ab59..77b357f 100644
--- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapper.java
+++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapper.java
@@ -31,6 +31,11 @@ import org.apache.yetus.audience.InterfaceAudience;
public interface MetricsMasterWrapper {
/**
+ * Returns if the master is currently running and is not attempting to shutdown.
+ */
+ boolean isRunning();
+
+ /**
* Get ServerName
*/
String getServerName();
diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSourceImpl.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSourceImpl.java
index fc49a40..8163050 100644
--- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSourceImpl.java
+++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSourceImpl.java
@@ -83,7 +83,9 @@ public class MetricsMasterSourceImpl
MetricsRecordBuilder metricsRecordBuilder = metricsCollector.addRecord(metricsName);
// masterWrapper can be null because this function is called inside of init.
- if (masterWrapper != null) {
+ // If the master is already stopped or has initiated a shutdown, no point in registering the
+ // metrics again.
+ if (masterWrapper != null && masterWrapper.isRunning()) {
// Pair<online region number, offline region number>
PairOfSameType<Integer> regionNumberPair = masterWrapper.getRegionCounts();
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CachedClusterId.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CachedClusterId.java
index 9ca7399..4bc444b 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CachedClusterId.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CachedClusterId.java
@@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.ClusterId;
+import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
@@ -46,8 +47,8 @@ public class CachedClusterId {
public static final Logger LOG = LoggerFactory.getLogger(CachedClusterId.class);
private static final int MAX_FETCH_TIMEOUT_MS = 10000;
- private Path rootDir;
- private FileSystem fs;
+ private final Path rootDir;
+ private final FileSystem fs;
// When true, indicates that a FileSystem fetch of ClusterID is in progress. This is used to
// avoid multiple fetches from FS and let only one thread fetch the information.
@@ -58,12 +59,15 @@ public class CachedClusterId {
// Immutable once set and read multiple times.
private ClusterId clusterId;
+ private final Server server;
+
// cache stats for testing.
private AtomicInteger cacheMisses = new AtomicInteger(0);
- public CachedClusterId(Configuration conf) throws IOException {
- rootDir = FSUtils.getRootDir(conf);
- fs = rootDir.getFileSystem(conf);
+ public CachedClusterId(Server server, Configuration conf) throws IOException {
+ this.rootDir = FSUtils.getRootDir(conf);
+ this.fs = rootDir.getFileSystem(conf);
+ this.server = server;
}
/**
@@ -130,9 +134,12 @@ public class CachedClusterId {
* trying get from a clean cache.
*
* @return ClusterId by reading from FileSystem or null in any error case or cluster ID does
- * not exist on the file system.
+ * not exist on the file system or if the server initiated a tear down.
*/
public String getFromCacheOrFetch() {
+ if (server.isStopping() || server.isStopped()) {
+ return null;
+ }
String id = getClusterId();
if (id != null) {
return id;
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index f244e76..7ed5ca6 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -586,7 +586,7 @@ public class HMaster extends HRegionServer implements MasterServices {
this.metaRegionLocationCache = null;
this.activeMasterManager = null;
}
- cachedClusterId = new CachedClusterId(conf);
+ cachedClusterId = new CachedClusterId(this, conf);
} catch (Throwable t) {
// Make sure we log the exception. HMaster is often started via reflection and the
// cause of failed startup is lost.
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapperImpl.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapperImpl.java
index 0f30ceb..caf4324 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapperImpl.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapperImpl.java
@@ -134,6 +134,10 @@ public class MetricsMasterWrapperImpl implements MetricsMasterWrapper {
return serverManager.getDeadServers().size();
}
+ @Override public boolean isRunning() {
+ return !(master.isStopped() || master.isStopping());
+ }
+
@Override
public String getServerName() {
ServerName serverName = master.getServerName();
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestCachedClusterId.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestCachedClusterId.java
index b8a4be8..c146685 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestCachedClusterId.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestCachedClusterId.java
@@ -76,7 +76,8 @@ public class TestCachedClusterId {
@Test
public void testMultiThreadedGetClusterId() throws Exception {
Configuration conf = TEST_UTIL.getConfiguration();
- CachedClusterId cachedClusterId = new CachedClusterId(conf);
+ CachedClusterId cachedClusterId = new CachedClusterId(TEST_UTIL.getHBaseCluster().getMaster(),
+ conf);
TestContext context = new TestContext(conf);
int numThreads = 16;
for (int i = 0; i < numThreads; i++) {