You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2017/03/08 16:14:51 UTC
hbase git commit: HBASE-17718 Difference between RS's servername and
its ephemeral node cause SSH stop working; AMENDMENT.
Repository: hbase
Updated Branches:
refs/heads/master e239e8d2d -> 6a57050c2
HBASE-17718 Difference between RS's servername and its ephemeral node cause SSH stop working; AMENDMENT.
Make test tighter by extending ServerListener so can find when Master
is in the waiting-on-regionservers state and making more assertions
about state.
Fix error where I would move on from waiting-on-regionservers if
we had waited max time.
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/6a57050c
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/6a57050c
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/6a57050c
Branch: refs/heads/master
Commit: 6a57050c24100438508199c9856b95be7024803a
Parents: e239e8d
Author: Michael Stack <st...@apache.org>
Authored: Wed Mar 8 04:58:53 2017 -0800
Committer: Michael Stack <st...@apache.org>
Committed: Wed Mar 8 08:14:24 2017 -0800
----------------------------------------------------------------------
.../org/apache/hadoop/hbase/master/HMaster.java | 8 +-
.../hadoop/hbase/master/ServerListener.java | 15 ++--
.../hadoop/hbase/master/ServerManager.java | 49 ++++++-----
.../hbase/zookeeper/DrainingServerTracker.java | 22 ++---
.../TestRSKilledWhenInitializing.java | 88 ++++++++++++--------
5 files changed, 103 insertions(+), 79 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hbase/blob/6a57050c/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index 501d3bd..a1cbe53 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -305,7 +305,7 @@ public class HMaster extends HRegionServer implements MasterServices {
MemoryBoundedLogMessageBuffer rsFatals;
// flag set after we become the active master (used for testing)
- private volatile boolean isActiveMaster = false;
+ private volatile boolean activeMaster = false;
// flag set after we complete initialization once active,
// it is not private since it's used in unit tests
@@ -597,7 +597,7 @@ public class HMaster extends HRegionServer implements MasterServices {
@Override
protected void waitForMasterActive(){
boolean tablesOnMaster = BaseLoadBalancer.tablesOnMaster(conf);
- while (!(tablesOnMaster && isActiveMaster)
+ while (!(tablesOnMaster && activeMaster)
&& !isStopped() && !isAborted()) {
sleeper.sleep();
}
@@ -733,7 +733,7 @@ public class HMaster extends HRegionServer implements MasterServices {
private void finishActiveMasterInitialization(MonitoredTask status)
throws IOException, InterruptedException, KeeperException, CoordinatedStateException {
- isActiveMaster = true;
+ activeMaster = true;
Thread zombieDetector = new Thread(new InitializationMonitor(this),
"ActiveMasterInitializationMonitor-" + System.currentTimeMillis());
zombieDetector.start();
@@ -2555,7 +2555,7 @@ public class HMaster extends HRegionServer implements MasterServices {
*/
@Override
public boolean isActiveMaster() {
- return isActiveMaster;
+ return activeMaster;
}
/**
http://git-wip-us.apache.org/repos/asf/hbase/blob/6a57050c/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerListener.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerListener.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerListener.java
index f168686..7946735 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerListener.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerListener.java
@@ -22,20 +22,25 @@ import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.ServerName;
/**
- * Get notification of server events. The invocations are inline
- * so make sure your implementation is fast else you'll slow hbase.
+ * Get notification of server registration events. The invocations are inline
+ * so make sure your implementation is fast or else you'll slow hbase.
*/
@InterfaceAudience.Private
public interface ServerListener {
/**
+ * Started waiting on RegionServers to check-in.
+ */
+ default void waiting() {};
+
+ /**
* The server has joined the cluster.
* @param serverName The remote servers name.
*/
- void serverAdded(final ServerName serverName);
+ default void serverAdded(final ServerName serverName) {};
/**
* The server was removed from the cluster.
* @param serverName The remote servers name.
*/
- void serverRemoved(final ServerName serverName);
-}
+ default void serverRemoved(final ServerName serverName) {};
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hbase/blob/6a57050c/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index e6b60d8..db0a0e5 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -53,6 +53,7 @@ import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.RetriesExhaustedException;
+import org.apache.hadoop.hbase.ipc.FailedServerException;
import org.apache.hadoop.hbase.ipc.HBaseRpcController;
import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer;
@@ -772,7 +773,16 @@ public class ServerManager {
*/
private void checkForRSznode(final ServerName serverName, final ServiceException se) {
if (se.getCause() == null) return;
- if (!(se.getCause() instanceof ConnectException)) return;
+ Throwable t = se.getCause();
+ if (t instanceof ConnectException) {
+ // If this, proceed to do cleanup.
+ } else {
+ // Look for FailedServerException
+ if (!(t instanceof IOException)) return;
+ if (t.getCause() == null) return;
+ if (!(t.getCause() instanceof FailedServerException)) return;
+ // Ok, found FailedServerException -- continue.
+ }
if (!isServerOnline(serverName)) return;
// We think this server is online. Check it has a znode up. Currently, a RS
// registers an ephereral znode in zk. If not present, something is up. Maybe
@@ -1030,20 +1040,19 @@ public class ServerManager {
*
* @throws InterruptedException
*/
- public void waitForRegionServers(MonitoredTask status)
- throws InterruptedException {
+ public void waitForRegionServers(MonitoredTask status) throws InterruptedException {
final long interval = this.master.getConfiguration().
- getLong(WAIT_ON_REGIONSERVERS_INTERVAL, 1500);
+ getLong(WAIT_ON_REGIONSERVERS_INTERVAL, 1500);
final long timeout = this.master.getConfiguration().
- getLong(WAIT_ON_REGIONSERVERS_TIMEOUT, 4500);
+ getLong(WAIT_ON_REGIONSERVERS_TIMEOUT, 4500);
// Min is not an absolute; just a friction making us wait longer on server checkin.
int minToStart = getMinToStart();
int maxToStart = this.master.getConfiguration().
- getInt(WAIT_ON_REGIONSERVERS_MAXTOSTART, Integer.MAX_VALUE);
+ getInt(WAIT_ON_REGIONSERVERS_MAXTOSTART, Integer.MAX_VALUE);
if (maxToStart < minToStart) {
LOG.warn(String.format("The value of '%s' (%d) is set less than '%s' (%d), ignoring.",
- WAIT_ON_REGIONSERVERS_MAXTOSTART, maxToStart,
- WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
+ WAIT_ON_REGIONSERVERS_MAXTOSTART, maxToStart,
+ WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
maxToStart = Integer.MAX_VALUE;
}
@@ -1060,19 +1069,19 @@ public class ServerManager {
// Next, we will keep cycling if ANY of the following three conditions are true:
// 1. The time since a regionserver registered is < interval (means servers are actively checking in).
// 2. We are under the total timeout.
- // 3. The count of servers is < minimum expected AND we are within timeout (this just puts up
- // a little friction making us wait a bit longer if < minimum servers).
+ // 3. The count of servers is < minimum.
+ for (ServerListener listener: this.listeners) {
+ listener.waiting();
+ }
while (!this.master.isStopped() && count < maxToStart &&
- (((lastCountChange + interval) > now) ||
- (timeout > slept) ||
- ((count < minToStart) && (timeout > slept)))) {
+ ((lastCountChange + interval) > now || timeout > slept || count < minToStart)) {
// Log some info at every interval time or if there is a change
if (oldCount != count || lastLogTime + interval < now) {
lastLogTime = now;
String msg =
- "Waiting for RegionServer count=" + count + " to settle; waited "+
- slept + "ms, expecting minimum=" + minToStart + "server(s) (max="+ getStrForMax(maxToStart) + "server(s)), " +
- "timeout=" + timeout + "ms, lastChange=" + (lastCountChange - now) + "ms";
+ "Waiting on RegionServer count=" + count + " to settle; waited="+
+ slept + "ms, expecting min=" + minToStart + " server(s), max="+ getStrForMax(maxToStart) +
+ " server(s), " + "timeout=" + timeout + "ms, lastChange=" + (lastCountChange - now) + "ms";
LOG.info(msg);
status.setStatus(msg);
}
@@ -1089,11 +1098,9 @@ public class ServerManager {
lastCountChange = now;
}
}
-
- LOG.info("Finished waiting for RegionServer count=" + count + " to settle, slept for " + slept + "ms," +
- " expecting minimum=" + minToStart + " server(s) (max=" + getStrForMax(maxToStart) + " server(s),"+
- " Master is "+ (this.master.isStopped() ? "stopped.": "running")
- );
+ LOG.info("Finished wait on RegionServer count=" + count + "; waited=" + slept + "ms," +
+ " expected min=" + minToStart + " server(s), max=" + getStrForMax(maxToStart) + " server(s),"+
+ " master is "+ (this.master.isStopped() ? "stopped.": "running"));
}
private String getStrForMax(final int max) {
http://git-wip-us.apache.org/repos/asf/hbase/blob/6a57050c/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/DrainingServerTracker.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/DrainingServerTracker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/DrainingServerTracker.java
index 32e0862..a4880df 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/DrainingServerTracker.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/DrainingServerTracker.java
@@ -70,22 +70,14 @@ public class DrainingServerTracker extends ZooKeeperListener {
public void start() throws KeeperException, IOException {
watcher.registerListener(this);
// Add a ServerListener to check if a server is draining when it's added.
- serverManager.registerListener(
- new ServerListener() {
-
- @Override
- public void serverAdded(ServerName sn) {
- if (drainingServers.contains(sn)){
- serverManager.addServerToDrainList(sn);
- }
- }
-
- @Override
- public void serverRemoved(ServerName serverName) {
- // no-op
- }
+ serverManager.registerListener(new ServerListener() {
+ @Override
+ public void serverAdded(ServerName sn) {
+ if (drainingServers.contains(sn)){
+ serverManager.addServerToDrainList(sn);
}
- );
+ }
+ });
List<String> servers =
ZKUtil.listChildrenAndWatchThem(watcher, watcher.znodePaths.drainingZNode);
add(servers);
http://git-wip-us.apache.org/repos/asf/hbase/blob/6a57050c/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java
index c01db2a..304bfe7 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java
@@ -19,6 +19,7 @@
package org.apache.hadoop.hbase.regionserver;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.util.List;
@@ -38,6 +39,7 @@ import org.apache.hadoop.hbase.LocalHBaseCluster;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.master.HMaster;
+import org.apache.hadoop.hbase.master.ServerListener;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionServerStartupResponse;
import org.apache.hadoop.hbase.testclassification.MediumTests;
@@ -92,52 +94,57 @@ public class TestRSKilledWhenInitializing {
RegisterAndDieRegionServer.class);
final MasterThread master = startMaster(cluster.getMasters().get(0));
try {
- masterActive.set(true);
- // Now start regionservers.
- // First RS to report for duty will kill itself when it gets a response.
- // See below in the RegisterAndDieRegionServer handleReportForDutyResponse.
+ // Master is up waiting on RegionServers to check in. Now start RegionServers.
for (int i = 0; i < NUM_RS; i++) {
cluster.getRegionServers().get(i).start();
}
- // Now wait on master to see NUM_RS + 1 servers as being online, NUM_RS and itself.
- // Then wait until the killed RS gets removed from zk and triggers Master to remove
- // it from list of online RS.
- List<ServerName> onlineServersList =
- master.getMaster().getServerManager().getOnlineServersList();
- while (onlineServersList.size() < NUM_RS + 1) {
- // Spin till we see NUM_RS + Master in online servers list.
+ // Now wait on master to see NUM_RS + 1 servers as being online, thats NUM_RS plus
+ // the Master itself (because Master hosts hbase:meta and checks in as though it a RS).
+ List<ServerName> onlineServersList = null;
+ do {
onlineServersList = master.getMaster().getServerManager().getOnlineServersList();
- }
- LOG.info(onlineServersList);
- assertEquals(NUM_RS + 1, onlineServersList.size());
- // Steady state. How many regions open?
- // Wait until killedRS is set
+ } while (onlineServersList.size() < (NUM_RS + 1));
+ // Wait until killedRS is set. Means RegionServer is starting to go down.
while (killedRS.get() == null) {
- Threads.sleep(10);
+ Threads.sleep(1);
+ }
+ // Wait on the RegionServer to fully die.
+ while (cluster.getLiveRegionServers().size() > NUM_RS) {
+ Threads.sleep(1);
+ }
+ // Make sure Master is fully up before progressing. Could take a while if regions
+ // being reassigned.
+ while (!master.getMaster().isInitialized()) {
+ Threads.sleep(1);
}
- final int regionsOpenCount = master.getMaster().getAssignmentManager().getNumRegionsOpened();
+
+ // Now in steady state. How many regions open? Master should have too many regionservers
+ // showing still. The downed RegionServer should still be showing as registered.
+ assertTrue(master.getMaster().getServerManager().isServerOnline(killedRS.get()));
// Find non-meta region (namespace?) and assign to the killed server. That'll trigger cleanup.
- Map<HRegionInfo, ServerName> assigments =
- master.getMaster().getAssignmentManager().getRegionStates().getRegionAssignments();
+ Map<HRegionInfo, ServerName> assignments = null;
+ do {
+ assignments = master.getMaster().getAssignmentManager().getRegionStates().getRegionAssignments();
+ } while (assignments == null || assignments.size() < 2);
HRegionInfo hri = null;
- for (Map.Entry<HRegionInfo, ServerName> e: assigments.entrySet()) {
+ for (Map.Entry<HRegionInfo, ServerName> e: assignments.entrySet()) {
if (e.getKey().isMetaRegion()) continue;
hri = e.getKey();
break;
}
// Try moving region to the killed server. It will fail. As by-product, we will
// remove the RS from Master online list because no corresponding znode.
+ assertEquals(NUM_RS + 1, master.getMaster().getServerManager().getOnlineServersList().size());
LOG.info("Move " + hri.getEncodedName() + " to " + killedRS.get());
master.getMaster().move(hri.getEncodedNameAsBytes(),
Bytes.toBytes(killedRS.get().toString()));
- while (onlineServersList.size() > NUM_RS) {
+ // Wait until the RS no longer shows as registered in Master.
+ while (onlineServersList.size() > (NUM_RS + 1)) {
Thread.sleep(100);
onlineServersList = master.getMaster().getServerManager().getOnlineServersList();
}
- // Just for kicks, ensure namespace was put back on the old server after above failed move.
- assertEquals(regionsOpenCount,
- master.getMaster().getAssignmentManager().getNumRegionsOpened());
} finally {
+ // Shutdown is messy with complaints about fs being closed. Why? TODO.
cluster.shutdown();
cluster.join();
TEST_UTIL.shutdownMiniDFSCluster();
@@ -146,19 +153,32 @@ public class TestRSKilledWhenInitializing {
}
}
+ /**
+ * Start Master. Get as far as the state where Master is waiting on
+ * RegionServers to check in, then return.
+ */
private MasterThread startMaster(MasterThread master) {
master.start();
- long startTime = System.currentTimeMillis();
- while (!master.getMaster().isInitialized()) {
- try {
- Thread.sleep(100);
- } catch (InterruptedException ignored) {
- LOG.info("Interrupted: ignoring");
- }
- if (System.currentTimeMillis() > startTime + 30000) {
- throw new RuntimeException("Master not active after 30 seconds");
+ // It takes a while until ServerManager creation to happen inside Master startup.
+ while (master.getMaster().getServerManager() == null) {
+ continue;
+ }
+ // Set a listener for the waiting-on-RegionServers state. We want to wait
+ // until this condition before we leave this method and start regionservers.
+ final AtomicBoolean waiting = new AtomicBoolean(false);
+ if (master.getMaster().getServerManager() == null) throw new NullPointerException("SM");
+ master.getMaster().getServerManager().registerListener(new ServerListener() {
+ @Override
+ public void waiting() {
+ waiting.set(true);
}
+ });
+ // Wait until the Master gets to place where it is waiting on RegionServers to check in.
+ while (!waiting.get()) {
+ continue;
}
+ // Set the global master-is-active; gets picked up by regionservers later.
+ masterActive.set(true);
return master;
}