You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by vj...@apache.org on 2020/05/13 16:43:34 UTC
[hbase] branch branch-2 updated: Revert "HBASE-24327 : Flaky connection in TestMasterShutdown#testMasterShutdo… (#1690)"
This is an automated email from the ASF dual-hosted git repository.
vjasani pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2 by this push:
new cb17d70 Revert "HBASE-24327 : Flaky connection in TestMasterShutdown#testMasterShutdo… (#1690)"
cb17d70 is described below
commit cb17d70226e1d3785f3a2f83467026feb0302092
Author: Viraj Jasani <vj...@apache.org>
AuthorDate: Wed May 13 22:12:58 2020 +0530
Revert "HBASE-24327 : Flaky connection in TestMasterShutdown#testMasterShutdo… (#1690)"
This reverts commit d9b60d3339ff3818eb0d55f20be6d20fb2f7b118.
---
.../hadoop/hbase/master/TestMasterShutdown.java | 74 ++++++++--------------
1 file changed, 28 insertions(+), 46 deletions(-)
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
index e99e932..7b3921e 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
@@ -22,8 +22,6 @@ import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import java.io.IOException;
import java.util.List;
-import java.util.concurrent.CompletableFuture;
-import java.util.concurrent.CompletionException;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClusterMetrics;
@@ -33,6 +31,7 @@ import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.LocalHBaseCluster;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.StartMiniClusterOption;
+import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
@@ -51,7 +50,7 @@ public class TestMasterShutdown {
@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
- HBaseClassTestRule.forClass(TestMasterShutdown.class);
+ HBaseClassTestRule.forClass(TestMasterShutdown.class);
private HBaseTestingUtility htu;
@@ -128,7 +127,7 @@ public class TestMasterShutdown {
public void testMasterShutdownBeforeStartingAnyRegionServer() throws Exception {
LocalHBaseCluster hbaseCluster = null;
try {
- htu = new HBaseTestingUtility(
+ htu = new HBaseTestingUtility(
createMasterShutdownBeforeStartingAnyRegionServerConfiguration());
// configure a cluster with
@@ -152,46 +151,19 @@ public class TestMasterShutdown {
hbaseCluster = new LocalHBaseCluster(htu.getConfiguration(), options.getNumMasters(),
options.getNumRegionServers(), options.getMasterClass(), options.getRsClass());
final MasterThread masterThread = hbaseCluster.getMasters().get(0);
-
masterThread.start();
- final CompletableFuture<Void> shutdownFuture = CompletableFuture.runAsync(() -> {
- // Switching to master registry exacerbated a race in the master bootstrap that can result
- // in a lost shutdown command (HBASE-8422, HBASE-23836). The race is essentially because
- // the server manager in HMaster is not initialized by the time shutdown() RPC (below) is
- // made to the master. The suspected reason as to why it was uncommon before HBASE-18095
- // is because the connection creation with ZK registry is so slow that by then the server
- // manager is usually init'ed in time for the RPC to be made. For now, adding an explicit
- // wait() in the test, waiting for the server manager to become available.
- final long timeout = TimeUnit.MINUTES.toMillis(10);
- assertNotEquals("timeout waiting for server manager to become available.", -1,
- htu.waitFor(timeout, () -> masterThread.getMaster().getServerManager() != null));
-
- // Master has come up far enough that we can terminate it without creating a zombie.
- LOG.debug("Attempting to establish connection.");
- try {
- // HBASE-24327 : (Resolve Flaky connection issues)
- // shutdown() RPC can have flaky ZK connection issues.
- // e.g
- // ERROR [RpcServer.priority.RWQ.Fifo.read.handler=1,queue=1,port=53033]
- // master.HMaster(2878): ZooKeeper exception trying to set cluster as down in ZK
- // org.apache.zookeeper.KeeperException$SystemErrorException:
- // KeeperErrorCode = SystemError
- //
- // However, even when above flakes happen, shutdown call does get completed even if
- // RPC call has failure. Hence, subsequent retries will never succeed as HMaster is
- // already shutdown. Hence, it can fail. To resolve it, after making one shutdown()
- // call, we are ignoring IOException.
- htu.getConnection().getAdmin().shutdown();
- LOG.info("Shutdown RPC sent.");
- } catch (IOException | CompletionException e) {
- LOG.warn("Failed to establish connection.", e);
- } catch (Throwable e) {
- LOG.warn("Something unexpected happened.", e);
- throw new RuntimeException(e);
- }
- });
-
- shutdownFuture.join();
+ // Switching to master registry exacerbated a race in the master bootstrap that can result
+ // in a lost shutdown command (HBASE-8422, HBASE-23836). The race is essentially because
+ // the server manager in HMaster is not initialized by the time shutdown() RPC (below) is
+ // made to the master. The suspected reason as to why it was uncommon before HBASE-18095
+ // is because the connection creation with ZK registry is so slow that by then the server
+ // manager is usually init'ed in time for the RPC to be made. For now, adding an explicit
+ // wait() in the test, waiting for the server manager to become available.
+ final long timeout = TimeUnit.MINUTES.toMillis(10);
+ assertNotEquals("Timeout waiting for server manager to become available.",
+ -1, Waiter.waitFor(htu.getConfiguration(), timeout,
+ () -> masterThread.getMaster().getServerManager() != null));
+ htu.getConnection().getAdmin().shutdown();
masterThread.join();
} finally {
if (hbaseCluster != null) {
@@ -214,9 +186,19 @@ public class TestMasterShutdown {
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
// don't need a long write pipeline for this test.
conf.setInt("dfs.replication", 1);
- // reduce client retries
- conf.setInt("hbase.client.retries.number", 3);
- // Recoverable ZK configs are tuned more aggressively
+ return conf;
+ }
+
+ /**
+ * Create a new {@link Configuration} based on {@code baseConf} that has ZooKeeper connection
+ * settings tuned very aggressively. The resulting client is used within a retry loop, so there's
+ * no value in having the client itself do the retries. We want to iterate on the base
+ * configuration because we're waiting for the mini-cluster to start and set its ZK client port.
+ *
+ * @return a new, configured {@link Configuration} instance.
+ */
+ private static Configuration createResponsiveZkConfig(final Configuration baseConf) {
+ final Configuration conf = HBaseConfiguration.create(baseConf);
conf.setInt(ReadOnlyZKClient.RECOVERY_RETRY, 3);
conf.setInt(ReadOnlyZKClient.RECOVERY_RETRY_INTERVAL_MILLIS, 100);
return conf;