You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2018/03/07 17:25:19 UTC
hbase git commit: Revert "HBASE-20137 TestRSGroups is flakey"
Repository: hbase
Updated Branches:
refs/heads/branch-2 f5c8713bd -> 96a42b735
Revert "HBASE-20137 TestRSGroups is flakey"
Revert. Fix is not right.
This reverts commit 6d1740d498d3f0f301a87a0a0cd598827790efa5.
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/96a42b73
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/96a42b73
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/96a42b73
Branch: refs/heads/branch-2
Commit: 96a42b7359674e787bd4d2e48e4622e9f5861645
Parents: f5c8713
Author: Michael Stack <st...@apache.org>
Authored: Wed Mar 7 09:24:09 2018 -0800
Committer: Michael Stack <st...@apache.org>
Committed: Wed Mar 7 09:25:02 2018 -0800
----------------------------------------------------------------------
.../hadoop/hbase/master/ServerManager.java | 17 +++----
.../FailedRemoteDispatchException.java | 9 +---
.../assignment/RegionTransitionProcedure.java | 15 +++----
.../master/assignment/UnassignProcedure.java | 47 ++++++++------------
.../assignment/TestAssignmentManager.java | 10 +----
5 files changed, 34 insertions(+), 64 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hbase/blob/96a42b73/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index e2f0b6b..06d6c8b 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -555,17 +555,15 @@ public class ServerManager {
}
/*
- * Expire the passed server. Add it to list of dead servers and queue a shutdown processing.
- * @return True if we expired passed <code>serverName</code> else false if we failed to schedule
- * an expire (and attendant ServerCrashProcedure -- some clients are dependent on
- * server crash procedure being queued and need to know if has not been queued).
+ * Expire the passed server. Add it to list of dead servers and queue a
+ * shutdown processing.
*/
- public synchronized boolean expireServer(final ServerName serverName) {
+ public synchronized void expireServer(final ServerName serverName) {
if (serverName.equals(master.getServerName())) {
if (!(master.isAborted() || master.isStopped())) {
master.stop("We lost our znode?");
}
- return false;
+ return;
}
if (!master.isServerCrashProcessingEnabled()) {
LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
@@ -575,13 +573,13 @@ public class ServerManager {
// the SCP is not enable yet and Meta's RIT may be suspend forever. See HBase-19287
master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName);
this.queuedDeadServers.add(serverName);
- return false;
+ return;
}
if (this.deadservers.isDeadServer(serverName)) {
// TODO: Can this happen? It shouldn't be online in this case?
LOG.warn("Expiration of " + serverName +
" but server shutdown already in progress");
- return false;
+ return;
}
moveFromOnlineToDeadServers(serverName);
@@ -593,7 +591,7 @@ public class ServerManager {
if (this.onlineServers.isEmpty()) {
master.stop("Cluster shutdown set; onlineServer=0");
}
- return false;
+ return;
}
LOG.info("Processing expiration of " + serverName + " on " + this.master.getServerName());
master.getAssignmentManager().submitServerCrash(serverName, true);
@@ -604,7 +602,6 @@ public class ServerManager {
listener.serverRemoved(serverName);
}
}
- return true;
}
@VisibleForTesting
http://git-wip-us.apache.org/repos/asf/hbase/blob/96a42b73/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java
index 7e98675..b459cfe 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java
@@ -21,17 +21,12 @@ import org.apache.hadoop.hbase.HBaseIOException;
import org.apache.yetus.audience.InterfaceAudience;
/**
- * Used internally signaling failed queue of a remote procedure operation.
- * Usually happens because no such remote server; it is being processed as crashed so it is not
- * online at time of RPC. Otherwise, something unexpected happened.
+ * Used internally signaling failed queue of a remote procedure
+ * operation.
*/
@SuppressWarnings("serial")
@InterfaceAudience.Private
public class FailedRemoteDispatchException extends HBaseIOException {
- public FailedRemoteDispatchException() {
- super();
- }
-
public FailedRemoteDispatchException(String msg) {
super(msg);
}
http://git-wip-us.apache.org/repos/asf/hbase/blob/96a42b73/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
index d427c58..6c63cb8 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
@@ -182,8 +182,10 @@ public abstract class RegionTransitionProcedure
public void remoteCallFailed(final MasterProcedureEnv env,
final ServerName serverName, final IOException exception) {
final RegionStateNode regionNode = getRegionState(env);
- LOG.warn("Remote call failed {}; rit={}, exception={}", this, regionNode.getState(),
- exception.toString());
+ String msg = exception.getMessage() == null? exception.getClass().getSimpleName():
+ exception.getMessage();
+ LOG.warn("Remote call failed " + this + "; " + regionNode.toShortString() +
+ "; exception=" + msg);
if (remoteCallFailed(env, regionNode, exception)) {
// NOTE: This call to wakeEvent puts this Procedure back on the scheduler.
// Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond
@@ -218,14 +220,9 @@ public abstract class RegionTransitionProcedure
// backtrack on stuff like the 'suspend' done above -- tricky as the 'wake' requests us -- and
// ditto up in the caller; it needs to undo state changes. Inside in remoteCallFailed, it does
// wake to undo the above suspend.
- //
- // We fail the addOperationToNode usually because there is no such remote server (it has
- // crashed and we are currently processing it or something went badly wrong and we have a
- // bad server).
if (!env.getRemoteDispatcher().addOperationToNode(targetServer, this)) {
- remoteCallFailed(env, targetServer, targetServer == null?
- new FailedRemoteDispatchException():
- new FailedRemoteDispatchException(targetServer.toShortString()));
+ remoteCallFailed(env, targetServer,
+ new FailedRemoteDispatchException(this + " to " + targetServer));
return false;
}
return true;
http://git-wip-us.apache.org/repos/asf/hbase/blob/96a42b73/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
index fd3d78d..3454d96 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
@@ -249,12 +249,17 @@ public class UnassignProcedure extends RegionTransitionProcedure {
final IOException exception) {
// TODO: Is there on-going rpc to cleanup?
if (exception instanceof ServerCrashException) {
- // This exception comes from ServerCrashProcedure after it is done with log splitting.
- // SCP found this region as a Region-In-Transition (RIT). Its call into here says it is ok to
- // let this procedure go on to a complete close now. This will release lock on this region so
- // subsequent action on region can succeed; e.g. the assign that follows this unassign when
- // a move (w/o wait on SCP the assign could run w/o logs being split so data loss).
- reportTransitionCLOSED(env, regionNode);
+ // This exception comes from ServerCrashProcedure after log splitting.
+ // SCP found this region as a RIT. Its call into here says it is ok to let this procedure go
+ // on to a complete close now. This will release lock on this region so subsequent action on
+ // region can succeed; e.g. the assign that follows this unassign when a move (w/o wait on SCP
+ // the assign could run w/o logs being split so data loss).
+ try {
+ reportTransition(env, regionNode, TransitionCode.CLOSED, HConstants.NO_SEQNUM);
+ } catch (UnexpectedStateException e) {
+ // Should never happen.
+ throw new RuntimeException(e);
+ }
} else if (exception instanceof RegionServerAbortedException ||
exception instanceof RegionServerStoppedException ||
exception instanceof ServerNotRunningYetException) {
@@ -268,33 +273,17 @@ public class UnassignProcedure extends RegionTransitionProcedure {
exception);
setTransitionState(RegionTransitionState.REGION_TRANSITION_FINISH);
} else {
- LOG.warn("Expiring server {}; rit={}, exception={}", this, regionNode.getState(),
- exception.toString());
- if (env.getMasterServices().getServerManager().expireServer(regionNode.getRegionLocation())) {
- // Return false so this procedure stays in suspended state. It will be woken up by
- // ServerCrashProcedure when it notices this RIT and calls this method again but with
- // a SCPException -- see above.
- // TODO: Add a SCP as a new subprocedure that we now come to depend on.
- return false;
- } else {
- LOG.warn("Failed expire of {}; presumed CRASHED; moving region to CLOSED state",
- regionNode.getRegionLocation());
- reportTransitionCLOSED(env, regionNode);
- }
+ LOG.warn("Expiring server " + this + "; " + regionNode.toShortString() +
+ ", exception=" + exception);
+ env.getMasterServices().getServerManager().expireServer(regionNode.getRegionLocation());
+ // Return false so this procedure stays in suspended state. It will be woken up by a
+ // ServerCrashProcedure when it notices this RIT.
+ // TODO: Add a SCP as a new subprocedure that we now come to depend on.
+ return false;
}
return true;
}
- private void reportTransitionCLOSED(final MasterProcedureEnv env,
- final RegionStateNode regionNode) {
- try {
- reportTransition(env, regionNode, TransitionCode.CLOSED, HConstants.NO_SEQNUM);
- } catch (UnexpectedStateException e) {
- // Should never happen.
- throw new RuntimeException(e);
- }
- }
-
@Override
public void toStringClassDetails(StringBuilder sb) {
super.toStringClassDetails(sb);
http://git-wip-us.apache.org/repos/asf/hbase/blob/96a42b73/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
index c5eb9ec..5003d51 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
@@ -722,7 +722,7 @@ public class TestAssignmentManager {
}
private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor {
- public static final int TYPES_OF_FAILURE = 7;
+ public static final int TYPES_OF_FAILURE = 6;
private int invocations;
@Override
@@ -734,14 +734,6 @@ public class TestAssignmentManager {
case 2: throw new RegionServerStoppedException("Fake!");
case 3: throw new ServerNotRunningYetException("Fake!");
case 4:
- // We will expire the server that we failed to rpc against.
- throw new FailedRemoteDispatchException("Fake!");
- case 5:
- // Mark this regionserver as already expiring so we go different code route; i.e. we
- // FAIL to expire the remote server and presume ok to move region to CLOSED. HBASE-20137.
- TestAssignmentManager.this.master.getServerManager().expireServer(server);
- throw new FailedRemoteDispatchException("Fake!");
- case 6:
LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout");
executor.schedule(new Runnable() {
@Override