You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2018/01/11 22:25:34 UTC
hbase git commit: HBASE-19694 The initialization order for a fresh
cluster is incorrect
Repository: hbase
Updated Branches:
refs/heads/branch-2 25e4bf8f3 -> 1a11fc92b
HBASE-19694 The initialization order for a fresh cluster is incorrect
Become active Master before calling the super class's run method. Have
the wait-on-becoming-active-Master be in-line rather than off in a
background thread (i.e. undo running thread in startActiveMasterManager)
Purge the fragile HBASE-16367 hackery that attempted to fix this issue
previously by adding a latch to try and hold up superclass RegionServer
until cluster id set by subclass Master.
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/1a11fc92
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/1a11fc92
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/1a11fc92
Branch: refs/heads/branch-2
Commit: 1a11fc92b16835808848b054d30b210d5572e492
Parents: 25e4bf8
Author: Michael Stack <st...@apache.org>
Authored: Tue Jan 9 12:49:39 2018 -0800
Committer: Michael Stack <st...@apache.org>
Committed: Thu Jan 11 14:25:25 2018 -0800
----------------------------------------------------------------------
.../org/apache/hadoop/hbase/master/HMaster.java | 122 ++++++++++---------
.../hbase/regionserver/HRegionServer.java | 7 --
.../hbase/master/TestTableStateManager.java | 2 +-
.../regionserver/TestPerColumnFamilyFlush.java | 1 -
4 files changed, 64 insertions(+), 68 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hbase/blob/1a11fc92/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index 945f54d..ee7cd18 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -40,7 +40,6 @@ import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
-import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
@@ -524,12 +523,9 @@ public class HMaster extends HRegionServer implements MasterServices {
// Some unit tests don't need a cluster, so no zookeeper at all
if (!conf.getBoolean("hbase.testing.nocluster", false)) {
- setInitLatch(new CountDownLatch(1));
- activeMasterManager = new ActiveMasterManager(zooKeeper, this.serverName, this);
- int infoPort = putUpJettyServer();
- startActiveMasterManager(infoPort);
+ this.activeMasterManager = new ActiveMasterManager(zooKeeper, this.serverName, this);
} else {
- activeMasterManager = null;
+ this.activeMasterManager = null;
}
} catch (Throwable t) {
// Make sure we log the exception. HMaster is often started via reflection and the
@@ -539,10 +535,27 @@ public class HMaster extends HRegionServer implements MasterServices {
}
}
- // Main run loop. Calls through to the regionserver run loop.
+ // Main run loop. Calls through to the regionserver run loop AFTER becoming active Master; will
+ // block in here until then.
@Override
public void run() {
try {
+ if (!conf.getBoolean("hbase.testing.nocluster", false)) {
+ try {
+ int infoPort = putUpJettyServer();
+ startActiveMasterManager(infoPort);
+ } catch (Throwable t) {
+ // Make sure we log the exception.
+ String error = "Failed to become Active Master";
+ LOG.error(error, t);
+ // Abort should have been called already.
+ if (!isAborted()) {
+ abort(error, t);
+ }
+ }
+ }
+ // Fall in here even if we have been aborted. Need to run the shutdown services and
+ // the super run call will do this for us.
super.run();
} finally {
if (this.clusterSchemaService != null) {
@@ -757,9 +770,9 @@ public class HMaster extends HRegionServer implements MasterServices {
private void finishActiveMasterInitialization(MonitoredTask status)
throws IOException, InterruptedException, KeeperException, CoordinatedStateException {
- activeMaster = true;
Thread zombieDetector = new Thread(new InitializationMonitor(this),
"ActiveMasterInitializationMonitor-" + System.currentTimeMillis());
+ zombieDetector.setDaemon(true);
zombieDetector.start();
/*
@@ -783,10 +796,9 @@ public class HMaster extends HRegionServer implements MasterServices {
this.tableDescriptors.getAll();
}
- // publish cluster ID
+ // Publish cluster ID
status.setStatus("Publishing Cluster ID in ZooKeeper");
ZKClusterId.setClusterId(this.zooKeeper, fileSystemManager.getClusterId());
- this.initLatch.countDown();
this.serverManager = createServerManager(this);
@@ -795,6 +807,10 @@ public class HMaster extends HRegionServer implements MasterServices {
status.setStatus("Initializing ZK system trackers");
initializeZKBasedSystemTrackers();
+ // Set Master as active now after we've setup zk with stuff like whether cluster is up or not.
+ // RegionServers won't come up if the cluster status is not up.
+ this.activeMaster = true;
+
// This is for backwards compatibility
// See HBASE-11393
status.setStatus("Update TableCFs node in ZNode");
@@ -818,7 +834,9 @@ public class HMaster extends HRegionServer implements MasterServices {
// Wake up this server to check in
sleeper.skipSleepCycle();
- // Wait for region servers to report in
+ // Wait for region servers to report in.
+ // With this as part of master initialization, it precludes our being able to start a single
+ // server that is both Master and RegionServer. Needs more thought. TODO.
String statusStr = "Wait for region servers to report in";
status.setStatus(statusStr);
LOG.info(Objects.toString(status));
@@ -1985,57 +2003,43 @@ public class HMaster extends HRegionServer implements MasterServices {
* this node for us since it is ephemeral.
*/
LOG.info("Adding backup master ZNode " + backupZNode);
- if (!MasterAddressTracker.setMasterAddress(zooKeeper, backupZNode,
- serverName, infoPort)) {
+ if (!MasterAddressTracker.setMasterAddress(zooKeeper, backupZNode, serverName, infoPort)) {
LOG.warn("Failed create of " + backupZNode + " by " + serverName);
}
-
- activeMasterManager.setInfoPort(infoPort);
- // Start a thread to try to become the active master, so we won't block here
- Threads.setDaemonThreadRunning(new Thread(new Runnable() {
- @Override
- public void run() {
- int timeout = conf.getInt(HConstants.ZK_SESSION_TIMEOUT,
- HConstants.DEFAULT_ZK_SESSION_TIMEOUT);
- // If we're a backup master, stall until a primary to writes his address
- if (conf.getBoolean(HConstants.MASTER_TYPE_BACKUP,
- HConstants.DEFAULT_MASTER_TYPE_BACKUP)) {
- LOG.debug("HMaster started in backup mode. "
- + "Stalling until master znode is written.");
- // This will only be a minute or so while the cluster starts up,
- // so don't worry about setting watches on the parent znode
- while (!activeMasterManager.hasActiveMaster()) {
- LOG.debug("Waiting for master address ZNode to be written "
- + "(Also watching cluster state node)");
- Threads.sleep(timeout);
- }
- }
- MonitoredTask status = TaskMonitor.get().createStatus("Master startup");
- status.setDescription("Master startup");
- try {
- if (activeMasterManager.blockUntilBecomingActiveMaster(timeout, status)) {
- finishActiveMasterInitialization(status);
- }
- } catch (Throwable t) {
- status.setStatus("Failed to become active: " + t.getMessage());
- LOG.error(HBaseMarkers.FATAL, "Failed to become active master", t);
- // HBASE-5680: Likely hadoop23 vs hadoop 20.x/1.x incompatibility
- if (t instanceof NoClassDefFoundError &&
- t.getMessage()
- .contains("org/apache/hadoop/hdfs/protocol/HdfsConstants$SafeModeAction")) {
- // improved error message for this special case
- abort("HBase is having a problem with its Hadoop jars. You may need to "
- + "recompile HBase against Hadoop version "
- + org.apache.hadoop.util.VersionInfo.getVersion()
- + " or change your hadoop jars to start properly", t);
- } else {
- abort("Unhandled exception. Starting shutdown.", t);
- }
- } finally {
- status.cleanup();
- }
+ this.activeMasterManager.setInfoPort(infoPort);
+ int timeout = conf.getInt(HConstants.ZK_SESSION_TIMEOUT, HConstants.DEFAULT_ZK_SESSION_TIMEOUT);
+ // If we're a backup master, stall until a primary to write this address
+ if (conf.getBoolean(HConstants.MASTER_TYPE_BACKUP, HConstants.DEFAULT_MASTER_TYPE_BACKUP)) {
+ LOG.debug("HMaster started in backup mode. Stalling until master znode is written.");
+ // This will only be a minute or so while the cluster starts up,
+ // so don't worry about setting watches on the parent znode
+ while (!activeMasterManager.hasActiveMaster()) {
+ LOG.debug("Waiting for master address and cluster state znode to be written.");
+ Threads.sleep(timeout);
+ }
+ }
+ MonitoredTask status = TaskMonitor.get().createStatus("Master startup");
+ status.setDescription("Master startup");
+ try {
+ if (activeMasterManager.blockUntilBecomingActiveMaster(timeout, status)) {
+ finishActiveMasterInitialization(status);
+ }
+ } catch (Throwable t) {
+ status.setStatus("Failed to become active: " + t.getMessage());
+ LOG.error(HBaseMarkers.FATAL, "Failed to become active master", t);
+ // HBASE-5680: Likely hadoop23 vs hadoop 20.x/1.x incompatibility
+ if (t instanceof NoClassDefFoundError && t.getMessage().
+ contains("org/apache/hadoop/hdfs/protocol/HdfsConstants$SafeModeAction")) {
+ // improved error message for this special case
+ abort("HBase is having a problem with its Hadoop jars. You may need to recompile " +
+ "HBase against Hadoop version " + org.apache.hadoop.util.VersionInfo.getVersion() +
+ " or change your hadoop jars to start properly", t);
+ } else {
+ abort("Unhandled exception. Starting shutdown.", t);
}
- }, getServerName().toShortString() + ".masterManager"));
+ } finally {
+ status.cleanup();
+ }
}
private void checkCompression(final TableDescriptor htd)
http://git-wip-us.apache.org/repos/asf/hbase/blob/1a11fc92/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 53390bd..3a52a16 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -243,7 +243,6 @@ public class HRegionServer extends HasThread implements
protected MemStoreFlusher cacheFlusher;
protected HeapMemoryManager hMemManager;
- protected CountDownLatch initLatch = null;
/**
* Cluster connection to be shared by services.
@@ -696,10 +695,6 @@ public class HRegionServer extends HasThread implements
return null;
}
- protected void setInitLatch(CountDownLatch latch) {
- this.initLatch = latch;
- }
-
/*
* Returns true if configured hostname should be used
*/
@@ -854,8 +849,6 @@ public class HRegionServer extends HasThread implements
// when ready.
blockAndCheckIfStopped(this.clusterStatusTracker);
- doLatch(this.initLatch);
-
// Retrieve clusterId
// Since cluster status is now up
// ID should have already been set by HMaster
http://git-wip-us.apache.org/repos/asf/hbase/blob/1a11fc92/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestTableStateManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestTableStateManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestTableStateManager.java
index 1f61ee7..81c1dfc 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestTableStateManager.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestTableStateManager.java
@@ -58,7 +58,7 @@ public class TestTableStateManager {
@Test(timeout = 60000)
public void testUpgradeFromZk() throws Exception {
final TableName tableName = TableName.valueOf(name.getMethodName());
- TEST_UTIL.startMiniCluster(2, 1);
+ TEST_UTIL.startMiniCluster(1, 1);
TEST_UTIL.shutdownMiniHBaseCluster();
ZKWatcher watcher = TEST_UTIL.getZooKeeperWatcher();
setTableStateInZK(watcher, tableName, ZooKeeperProtos.DeprecatedTableState.State.DISABLED);
http://git-wip-us.apache.org/repos/asf/hbase/blob/1a11fc92/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestPerColumnFamilyFlush.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestPerColumnFamilyFlush.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestPerColumnFamilyFlush.java
index 2e3ff7e..3b5609e 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestPerColumnFamilyFlush.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestPerColumnFamilyFlush.java
@@ -236,7 +236,6 @@ public class TestPerColumnFamilyFlush {
// CF3 shouldn't have been touched.
assertEquals(cf3MemstoreSize, oldCF3MemstoreSize);
assertEquals(totalMemstoreSize, cf3MemstoreSize.getDataSize());
- assertEquals(smallestSeqInRegionCurrentMemstore, smallestSeqCF3);
// What happens when we hit the memstore limit, but we are not able to find
// any Column Family above the threshold?