You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by zh...@apache.org on 2018/01/23 03:07:48 UTC
hbase git commit: HBASE-19838 Can not shutdown backup master cleanly
when it has already tried to become the active master
Repository: hbase
Updated Branches:
refs/heads/master 541f8ad8a -> 970636c5a
HBASE-19838 Can not shutdown backup master cleanly when it has already tried to become the active master
On Master@shutdown, close the shared Master connection to kill any
ongoing RPCs by hosted clients.
M hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
Call close ont the Master shared clusterconnection to kill any ongoing
rpcs.
M hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
Remove guts of close; we were closing the Masters connection....not
our responsibility.
Added unit test written by Duo Zhang which demonstrates the case where
Master will not go down.
Signed-off-by: zhangduo <zh...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/970636c5
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/970636c5
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/970636c5
Branch: refs/heads/master
Commit: 970636c5afbd1a12a998af3e8b0825f806bedeca
Parents: 541f8ad
Author: Michael Stack <st...@apache.org>
Authored: Mon Jan 22 14:44:16 2018 -0800
Committer: zhangduo <zh...@apache.org>
Committed: Tue Jan 23 11:07:00 2018 +0800
----------------------------------------------------------------------
.../org/apache/hadoop/hbase/master/HMaster.java | 21 +++-
.../hadoop/hbase/master/ServerManager.java | 16 +--
.../hbase/regionserver/HRegionServer.java | 3 +
.../hbase/master/TestShutdownBackupMaster.java | 108 +++++++++++++++++++
4 files changed, 132 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hbase/blob/970636c5/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index 09b18bc..88d8596 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -2605,7 +2605,7 @@ public class HMaster extends HRegionServer implements MasterServices {
}
@Override
- public void abort(final String msg, final Throwable t) {
+ public void abort(String reason, Throwable cause) {
if (isAborted() || isStopped()) {
return;
}
@@ -2614,8 +2614,9 @@ public class HMaster extends HRegionServer implements MasterServices {
LOG.error(HBaseMarkers.FATAL, "Master server abort: loaded coprocessors are: " +
getLoadedCoprocessors());
}
- if (t != null) {
- LOG.error(HBaseMarkers.FATAL, msg, t);
+ String msg = "***** ABORTING master " + this + ": " + reason + " *****";
+ if (cause != null) {
+ LOG.error(HBaseMarkers.FATAL, msg, cause);
} else {
LOG.error(HBaseMarkers.FATAL, msg);
}
@@ -2666,14 +2667,19 @@ public class HMaster extends HRegionServer implements MasterServices {
return rsFatals;
}
+ /**
+ * Shutdown the cluster.
+ * Master runs a coordinated stop of all RegionServers and then itself.
+ */
public void shutdown() throws IOException {
if (cpHost != null) {
cpHost.preShutdown();
}
-
+ // Tell the servermanager cluster is down.
if (this.serverManager != null) {
this.serverManager.shutdownCluster();
}
+ // Set the cluster down flag; broadcast across the cluster.
if (this.clusterStatusTracker != null){
try {
this.clusterStatusTracker.setClusterDown();
@@ -2681,6 +2687,13 @@ public class HMaster extends HRegionServer implements MasterServices {
LOG.error("ZooKeeper exception trying to set cluster as down in ZK", e);
}
}
+ // Shutdown our cluster connection. This will kill any hosted RPCs that might be going on;
+ // this is what we want especially if the Master is in startup phase doing call outs to
+ // hbase:meta, etc. when cluster is down. Without ths connection close, we'd have to wait on
+ // the rpc to timeout.
+ if (this.clusterConnection != null) {
+ this.clusterConnection.close();
+ }
}
public void stopMaster() throws IOException {
http://git-wip-us.apache.org/repos/asf/hbase/blob/970636c5/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index 3db6033..3c84bfb 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -200,10 +200,8 @@ public class ServerManager {
Configuration c = master.getConfiguration();
maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
- this.connection = connect ? master.getClusterConnection() : null;
- this.rpcControllerFactory = this.connection == null
- ? null
- : connection.getRpcControllerFactory();
+ this.connection = connect? master.getClusterConnection(): null;
+ this.rpcControllerFactory = this.connection == null? null: connection.getRpcControllerFactory();
}
/**
@@ -968,16 +966,10 @@ public class ServerManager {
}
/**
- * Stop the ServerManager. Currently closes the connection to the master.
+ * Stop the ServerManager.
*/
public void stop() {
- if (connection != null) {
- try {
- connection.close();
- } catch (IOException e) {
- LOG.error("Attempt to close connection to master failed", e);
- }
- }
+ // Nothing to do.
}
/**
http://git-wip-us.apache.org/repos/asf/hbase/blob/970636c5/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 036a5c2..3844415 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -251,6 +251,9 @@ public class HRegionServer extends HasThread implements
* Cluster connection to be shared by services.
* Initialized at server startup and closed when server shuts down.
* Clients must never close it explicitly.
+ * Clients hosted by this Server should make use of this clusterConnection rather than create
+ * their own; if they create their own, there is no way for the hosting server to shutdown
+ * ongoing client RPCs.
*/
protected ClusterConnection clusterConnection;
http://git-wip-us.apache.org/repos/asf/hbase/blob/970636c5/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestShutdownBackupMaster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestShutdownBackupMaster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestShutdownBackupMaster.java
new file mode 100644
index 0000000..02d7f2f
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestShutdownBackupMaster.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master;
+
+import static org.junit.Assert.assertNotNull;
+
+import java.io.IOException;
+import java.util.concurrent.CountDownLatch;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.CategoryBasedTimeout;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.MiniHBaseCluster;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
+import org.apache.zookeeper.KeeperException;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.rules.TestRule;
+
+/**
+ * Test to confirm that we will not hang when stop a backup master which is trying to become the
+ * active master. See HBASE-19838
+ */
+@Category({ MasterTests.class, MediumTests.class })
+public class TestShutdownBackupMaster {
+ @Rule public final TestRule timeout = CategoryBasedTimeout.builder().withTimeout(this.getClass()).
+ withLookingForStuckThread(true).build();
+
+ private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
+
+ private static volatile CountDownLatch ARRIVE;
+
+ private static volatile CountDownLatch CONTINUE;
+
+ public static final class MockHMaster extends HMaster {
+
+ public MockHMaster(Configuration conf) throws IOException, KeeperException {
+ super(conf);
+ }
+
+ @Override
+ void initClusterSchemaService() throws IOException, InterruptedException {
+ if (ARRIVE != null) {
+ ARRIVE.countDown();
+ CONTINUE.await();
+ }
+ super.initClusterSchemaService();
+ }
+ }
+
+ @BeforeClass
+ public static void setUpBeforeClass() throws Exception {
+ UTIL.getConfiguration().setClass(HConstants.MASTER_IMPL, MockHMaster.class, HMaster.class);
+ UTIL.startMiniCluster(2, 2);
+ UTIL.waitUntilAllSystemRegionsAssigned();
+ }
+
+ @AfterClass
+ public static void tearDownAfterClass() throws Exception {
+ // make sure that we can stop the cluster cleanly
+ UTIL.shutdownMiniCluster();
+ }
+
+ @Test
+ public void testShutdownWhileBecomingActive() throws InterruptedException {
+ MiniHBaseCluster cluster = UTIL.getHBaseCluster();
+ HMaster activeMaster = null;
+ HMaster backupMaster = null;
+ for (MasterThread t : cluster.getMasterThreads()) {
+ if (t.getMaster().isActiveMaster()) {
+ activeMaster = t.getMaster();
+ } else {
+ backupMaster = t.getMaster();
+ }
+ }
+ assertNotNull(activeMaster);
+ assertNotNull(backupMaster);
+ ARRIVE = new CountDownLatch(1);
+ CONTINUE = new CountDownLatch(1);
+ activeMaster.abort("Aborting active master for test");
+ // wait until we arrive the initClusterSchemaService
+ ARRIVE.await();
+ // killall RSes
+ cluster.getRegionServerThreads().stream().map(t -> t.getRegionServer())
+ .forEachOrdered(rs -> rs.abort("Aborting RS for test"));
+ CONTINUE.countDown();
+ }
+}