You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by vj...@apache.org on 2020/05/12 21:12:01 UTC
[hbase] branch branch-2.3 updated: HBASE-24327 : Flaky connection in TestMasterShutdown#testMasterShutdo… (#1690)
This is an automated email from the ASF dual-hosted git repository.
vjasani pushed a commit to branch branch-2.3
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2.3 by this push:
new 8f245aa HBASE-24327 : Flaky connection in TestMasterShutdown#testMasterShutdo… (#1690)
8f245aa is described below
commit 8f245aa8f84456c9ba06727ad16634879a4f4265
Author: Viraj Jasani <vj...@apache.org>
AuthorDate: Wed May 13 02:38:47 2020 +0530
HBASE-24327 : Flaky connection in TestMasterShutdown#testMasterShutdo… (#1690)
Signed-off-by: Bharath Vissapragada <bh...@apache.org>
---
.../hadoop/hbase/master/TestMasterShutdown.java | 74 ++++++++++++++--------
1 file changed, 46 insertions(+), 28 deletions(-)
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
index 7b3921e..e99e932 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
@@ -22,6 +22,8 @@ import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import java.io.IOException;
import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.CompletionException;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClusterMetrics;
@@ -31,7 +33,6 @@ import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.LocalHBaseCluster;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.StartMiniClusterOption;
-import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
@@ -50,7 +51,7 @@ public class TestMasterShutdown {
@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
- HBaseClassTestRule.forClass(TestMasterShutdown.class);
+ HBaseClassTestRule.forClass(TestMasterShutdown.class);
private HBaseTestingUtility htu;
@@ -127,7 +128,7 @@ public class TestMasterShutdown {
public void testMasterShutdownBeforeStartingAnyRegionServer() throws Exception {
LocalHBaseCluster hbaseCluster = null;
try {
- htu = new HBaseTestingUtility(
+ htu = new HBaseTestingUtility(
createMasterShutdownBeforeStartingAnyRegionServerConfiguration());
// configure a cluster with
@@ -151,19 +152,46 @@ public class TestMasterShutdown {
hbaseCluster = new LocalHBaseCluster(htu.getConfiguration(), options.getNumMasters(),
options.getNumRegionServers(), options.getMasterClass(), options.getRsClass());
final MasterThread masterThread = hbaseCluster.getMasters().get(0);
+
masterThread.start();
- // Switching to master registry exacerbated a race in the master bootstrap that can result
- // in a lost shutdown command (HBASE-8422, HBASE-23836). The race is essentially because
- // the server manager in HMaster is not initialized by the time shutdown() RPC (below) is
- // made to the master. The suspected reason as to why it was uncommon before HBASE-18095
- // is because the connection creation with ZK registry is so slow that by then the server
- // manager is usually init'ed in time for the RPC to be made. For now, adding an explicit
- // wait() in the test, waiting for the server manager to become available.
- final long timeout = TimeUnit.MINUTES.toMillis(10);
- assertNotEquals("Timeout waiting for server manager to become available.",
- -1, Waiter.waitFor(htu.getConfiguration(), timeout,
- () -> masterThread.getMaster().getServerManager() != null));
- htu.getConnection().getAdmin().shutdown();
+ final CompletableFuture<Void> shutdownFuture = CompletableFuture.runAsync(() -> {
+ // Switching to master registry exacerbated a race in the master bootstrap that can result
+ // in a lost shutdown command (HBASE-8422, HBASE-23836). The race is essentially because
+ // the server manager in HMaster is not initialized by the time shutdown() RPC (below) is
+ // made to the master. The suspected reason as to why it was uncommon before HBASE-18095
+ // is because the connection creation with ZK registry is so slow that by then the server
+ // manager is usually init'ed in time for the RPC to be made. For now, adding an explicit
+ // wait() in the test, waiting for the server manager to become available.
+ final long timeout = TimeUnit.MINUTES.toMillis(10);
+ assertNotEquals("timeout waiting for server manager to become available.", -1,
+ htu.waitFor(timeout, () -> masterThread.getMaster().getServerManager() != null));
+
+ // Master has come up far enough that we can terminate it without creating a zombie.
+ LOG.debug("Attempting to establish connection.");
+ try {
+ // HBASE-24327 : (Resolve Flaky connection issues)
+ // shutdown() RPC can have flaky ZK connection issues.
+ // e.g.
+ // ERROR [RpcServer.priority.RWQ.Fifo.read.handler=1,queue=1,port=53033]
+ // master.HMaster(2878): ZooKeeper exception trying to set cluster as down in ZK
+ // org.apache.zookeeper.KeeperException$SystemErrorException:
+ // KeeperErrorCode = SystemError
+ //
+ // However, even when such a flake happens, the shutdown itself still completes even
+ // though the RPC call reports a failure. Subsequent retries can then never succeed,
+ // because HMaster is already shut down, so the call can fail. To work around this,
+ // we make a single shutdown() call and ignore any IOException it throws.
+ htu.getConnection().getAdmin().shutdown();
+ LOG.info("Shutdown RPC sent.");
+ } catch (IOException | CompletionException e) {
+ LOG.warn("Failed to establish connection.", e);
+ } catch (Throwable e) {
+ LOG.warn("Something unexpected happened.", e);
+ throw new RuntimeException(e);
+ }
+ });
+
+ shutdownFuture.join();
masterThread.join();
} finally {
if (hbaseCluster != null) {
@@ -186,19 +214,9 @@ public class TestMasterShutdown {
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
// don't need a long write pipeline for this test.
conf.setInt("dfs.replication", 1);
- return conf;
- }
-
- /**
- * Create a new {@link Configuration} based on {@code baseConf} that has ZooKeeper connection
- * settings tuned very aggressively. The resulting client is used within a retry loop, so there's
- * no value in having the client itself do the retries. We want to iterate on the base
- * configuration because we're waiting for the mini-cluster to start and set it's ZK client port.
- *
- * @return a new, configured {@link Configuration} instance.
- */
- private static Configuration createResponsiveZkConfig(final Configuration baseConf) {
- final Configuration conf = HBaseConfiguration.create(baseConf);
+ // reduce client retries
+ conf.setInt("hbase.client.retries.number", 3);
+ // Recoverable ZK configs are tuned more aggressively
conf.setInt(ReadOnlyZKClient.RECOVERY_RETRY, 3);
conf.setInt(ReadOnlyZKClient.RECOVERY_RETRY_INTERVAL_MILLIS, 100);
return conf;