You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2018/11/14 16:17:31 UTC
hbase git commit: HBASE-21440 Assign procedure on the crashed server
is not properly interrupted
Repository: hbase
Updated Branches:
refs/heads/branch-2.0 eee4cf188 -> 3e1f24ed4
HBASE-21440 Assign procedure on the crashed server is not properly interrupted
Signed-off-by: Allan Yang <al...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/3e1f24ed
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/3e1f24ed
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/3e1f24ed
Branch: refs/heads/branch-2.0
Commit: 3e1f24ed4e018182cd5b098b7a0d88c5ec8e35a4
Parents: eee4cf1
Author: stack <st...@apache.org>
Authored: Wed Nov 14 08:16:26 2018 -0800
Committer: stack <st...@apache.org>
Committed: Wed Nov 14 08:16:26 2018 -0800
----------------------------------------------------------------------
.../procedure2/RemoteProcedureDispatcher.java | 2 +-
.../master/assignment/AssignProcedure.java | 12 ++++++
.../master/assignment/AssignmentManager.java | 39 --------------------
.../assignment/RegionTransitionProcedure.java | 6 ++-
.../master/procedure/ServerCrashProcedure.java | 16 ++++----
.../master/snapshot/TestAssignProcedure.java | 3 +-
6 files changed, 28 insertions(+), 50 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hbase/blob/3e1f24ed/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
----------------------------------------------------------------------
diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
index d0b8280..d068bc3 100644
--- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
+++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
@@ -233,7 +233,7 @@ public abstract class RemoteProcedureDispatcher<TEnv, TRemote extends Comparable
public interface RemoteProcedure<TEnv, TRemote> {
RemoteOperation remoteCallBuild(TEnv env, TRemote remote);
void remoteCallCompleted(TEnv env, TRemote remote, RemoteOperation response);
- void remoteCallFailed(TEnv env, TRemote remote, IOException exception);
+ boolean remoteCallFailed(TEnv env, TRemote remote, IOException exception);
}
/**
http://git-wip-us.apache.org/repos/asf/hbase/blob/3e1f24ed/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java
index a0297b3..f2b967a 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java
@@ -34,6 +34,7 @@ import org.apache.hadoop.hbase.master.TableStateManager;
import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionOpenOperation;
+import org.apache.hadoop.hbase.master.procedure.ServerCrashException;
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
@@ -388,6 +389,17 @@ public class AssignProcedure extends RegionTransitionProcedure {
@Override
protected boolean remoteCallFailed(final MasterProcedureEnv env, final RegionStateNode regionNode,
final IOException exception) {
+ RegionTransitionState tState = getTransitionState();
+ if (tState == RegionTransitionState.REGION_TRANSITION_FINISH
+ && exception instanceof ServerCrashException) {
+ // if we found that AssignProcedure is at this stage, then ServerCerash handling may/may not
+ // have any effect
+ // depending upon the race between handling of the failure and execution at
+ // REGION_TRANSITION_FINISH state
+ LOG.warn("Assign Procedure is at state:" + tState
+ + ", so Handling of Server Crash may not have any affect");
+ return false;
+ }
handleFailure(env, regionNode);
return true;
}
http://git-wip-us.apache.org/repos/asf/hbase/blob/3e1f24ed/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index 04a25c7..00b70d7 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -42,7 +42,6 @@ import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.YouAreDeadException;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
-import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.TableState;
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
@@ -64,7 +63,6 @@ import org.apache.hadoop.hbase.master.balancer.FavoredStochasticBalancer;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
-import org.apache.hadoop.hbase.master.procedure.ServerCrashException;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
@@ -88,7 +86,6 @@ import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
-import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
@@ -1858,42 +1855,6 @@ public class AssignmentManager implements ServerListener {
}
}
- /**
- * Handle RIT of meta region against crashed server.
- * Only used when ServerCrashProcedure is not enabled.
- * See handleRIT in ServerCrashProcedure for similar function.
- *
- * @param serverName Server that has already crashed
- */
- public void handleMetaRITOnCrashedServer(ServerName serverName) {
- RegionInfo hri = RegionReplicaUtil
- .getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO,
- RegionInfo.DEFAULT_REPLICA_ID);
- RegionState regionStateNode = getRegionStates().getRegionState(hri);
- if (regionStateNode == null) {
- LOG.warn("RegionStateNode is null for " + hri);
- return;
- }
- ServerName rsnServerName = regionStateNode.getServerName();
- if (rsnServerName != null && !rsnServerName.equals(serverName)) {
- return;
- } else if (rsnServerName == null) {
- LOG.warn("Empty ServerName in RegionStateNode; proceeding anyways in case latched " +
- "RecoverMetaProcedure so meta latch gets cleaned up.");
- }
- // meta has been assigned to crashed server.
- LOG.info("Meta assigned to crashed " + serverName + "; reassigning...");
- // Handle failure and wake event
- RegionTransitionProcedure rtp = getRegionStates().getRegionTransitionProcedure(hri);
- // Do not need to consider for REGION_TRANSITION_QUEUE step
- if (rtp != null && rtp.isMeta() &&
- rtp.getTransitionState() == RegionTransitionState.REGION_TRANSITION_DISPATCH) {
- LOG.debug("Failing " + rtp.toString());
- rtp.remoteCallFailed(master.getMasterProcedureExecutor().getEnvironment(), serverName,
- new ServerCrashException(rtp.getProcId(), serverName));
- }
- }
-
@VisibleForTesting
MasterServices getMaster() {
return master;
http://git-wip-us.apache.org/repos/asf/hbase/blob/3e1f24ed/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
index 00ace72..b8fda06 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
@@ -205,7 +205,7 @@ public abstract class RegionTransitionProcedure
this.transitionState = state;
}
- RegionTransitionState getTransitionState() {
+ protected RegionTransitionState getTransitionState() {
return transitionState;
}
@@ -245,7 +245,7 @@ public abstract class RegionTransitionProcedure
}
@Override
- public synchronized void remoteCallFailed(final MasterProcedureEnv env,
+ public synchronized boolean remoteCallFailed(final MasterProcedureEnv env,
final ServerName serverName, final IOException exception) {
final RegionStateNode regionNode = getRegionState(env);
LOG.warn("Remote call failed {} {}", this, regionNode.toShortString(), exception);
@@ -254,7 +254,9 @@ public abstract class RegionTransitionProcedure
// Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond
// this method. Just get out of this current processing quickly.
regionNode.getProcedureEvent().wake(env.getProcedureScheduler());
+ return true;
}
+ return false;
// else leave the procedure in suspended state; it is waiting on another call to this callback
}
http://git-wip-us.apache.org/repos/asf/hbase/blob/3e1f24ed/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
index 6ec0079..9fe5545 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
@@ -419,13 +419,15 @@ public class ServerCrashProcedure
if (sce == null) {
sce = new ServerCrashException(getProcId(), getServerName());
}
- rtp.remoteCallFailed(env, this.serverName, sce);
- // If an assign, remove from passed-in list of regions so we subsequently do not create
- // a new assign; the exisitng assign after the call to remoteCallFailed will recalibrate
- // and assign to a server other than the crashed one; no need to create new assign.
- // If an unassign, do not return this region; the above cancel will wake up the unassign and
- // it will complete. Done.
- it.remove();
+ if(rtp.remoteCallFailed(env, this.serverName, sce)) {
+ // If an assign, remove from passed-in list of regions so we subsequently do not create
+ // a new assign; the exisitng assign after the call to remoteCallFailed will recalibrate
+ // and assign to a server other than the crashed one; no need to create new assign.
+ // If an unassign, do not return this region; the above cancel will wake up the unassign and
+ // it will complete. Done.
+ it.remove();
+ }
+
}
return toAssign;
}
http://git-wip-us.apache.org/repos/asf/hbase/blob/3e1f24ed/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestAssignProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestAssignProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestAssignProcedure.java
index 8836c9f..5067fc3 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestAssignProcedure.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestAssignProcedure.java
@@ -108,11 +108,12 @@ public class TestAssignProcedure {
}
@Override
- public void remoteCallFailed(final MasterProcedureEnv env,
+ public boolean remoteCallFailed(final MasterProcedureEnv env,
final ServerName serverName, final IOException exception) {
// Just skip this remoteCallFailed. Its too hard to mock. Assert it is called though.
// Happens after the code we are testing has been called.
this.remoteCallFailedWasCalled.set(true);
+ return true;
}
};