You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2018/11/14 16:17:31 UTC

hbase git commit: HBASE-21440 Assign procedure on the crashed server is not properly interrupted

Repository: hbase
Updated Branches:
  refs/heads/branch-2.0 eee4cf188 -> 3e1f24ed4


HBASE-21440 Assign procedure on the crashed server is not properly interrupted

Signed-off-by: Allan Yang <al...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/3e1f24ed
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/3e1f24ed
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/3e1f24ed

Branch: refs/heads/branch-2.0
Commit: 3e1f24ed4e018182cd5b098b7a0d88c5ec8e35a4
Parents: eee4cf1
Author: stack <st...@apache.org>
Authored: Wed Nov 14 08:16:26 2018 -0800
Committer: stack <st...@apache.org>
Committed: Wed Nov 14 08:16:26 2018 -0800

----------------------------------------------------------------------
 .../procedure2/RemoteProcedureDispatcher.java   |  2 +-
 .../master/assignment/AssignProcedure.java      | 12 ++++++
 .../master/assignment/AssignmentManager.java    | 39 --------------------
 .../assignment/RegionTransitionProcedure.java   |  6 ++-
 .../master/procedure/ServerCrashProcedure.java  | 16 ++++----
 .../master/snapshot/TestAssignProcedure.java    |  3 +-
 6 files changed, 28 insertions(+), 50 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/3e1f24ed/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
----------------------------------------------------------------------
diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
index d0b8280..d068bc3 100644
--- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
+++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
@@ -233,7 +233,7 @@ public abstract class RemoteProcedureDispatcher<TEnv, TRemote extends Comparable
   public interface RemoteProcedure<TEnv, TRemote> {
     RemoteOperation remoteCallBuild(TEnv env, TRemote remote);
     void remoteCallCompleted(TEnv env, TRemote remote, RemoteOperation response);
-    void remoteCallFailed(TEnv env, TRemote remote, IOException exception);
+    boolean remoteCallFailed(TEnv env, TRemote remote, IOException exception);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/hbase/blob/3e1f24ed/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java
index a0297b3..f2b967a 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java
@@ -34,6 +34,7 @@ import org.apache.hadoop.hbase.master.TableStateManager;
 import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
 import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionOpenOperation;
+import org.apache.hadoop.hbase.master.procedure.ServerCrashException;
 import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
 import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
 import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
@@ -388,6 +389,17 @@ public class AssignProcedure extends RegionTransitionProcedure {
   @Override
   protected boolean remoteCallFailed(final MasterProcedureEnv env, final RegionStateNode regionNode,
       final IOException exception) {
+    RegionTransitionState tState = getTransitionState();
+    if (tState == RegionTransitionState.REGION_TRANSITION_FINISH
+        && exception instanceof ServerCrashException) {
+      // if we found that AssignProcedure is at this stage, then ServerCerash handling may/may not
+      // have any effect
+      // depending upon the race between handling of the failure and execution at
+      // REGION_TRANSITION_FINISH state
+      LOG.warn("Assign Procedure is at state:" + tState
+          + ", so Handling of Server Crash may not have any affect");
+      return false;
+    }
     handleFailure(env, regionNode);
     return true;
   }

http://git-wip-us.apache.org/repos/asf/hbase/blob/3e1f24ed/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index 04a25c7..00b70d7 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -42,7 +42,6 @@ import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.YouAreDeadException;
 import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.client.RegionInfoBuilder;
-import org.apache.hadoop.hbase.client.RegionReplicaUtil;
 import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.client.TableState;
 import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
@@ -64,7 +63,6 @@ import org.apache.hadoop.hbase.master.balancer.FavoredStochasticBalancer;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
 import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
-import org.apache.hadoop.hbase.master.procedure.ServerCrashException;
 import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
 import org.apache.hadoop.hbase.procedure2.Procedure;
 import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
@@ -88,7 +86,6 @@ import org.slf4j.LoggerFactory;
 import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
 
 import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
-import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
@@ -1858,42 +1855,6 @@ public class AssignmentManager implements ServerListener {
     }
   }
 
-  /**
-   * Handle RIT of meta region against crashed server.
-   * Only used when ServerCrashProcedure is not enabled.
-   * See handleRIT in ServerCrashProcedure for similar function.
-   *
-   * @param serverName Server that has already crashed
-   */
-  public void handleMetaRITOnCrashedServer(ServerName serverName) {
-    RegionInfo hri = RegionReplicaUtil
-        .getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO,
-            RegionInfo.DEFAULT_REPLICA_ID);
-    RegionState regionStateNode = getRegionStates().getRegionState(hri);
-    if (regionStateNode == null) {
-      LOG.warn("RegionStateNode is null for " + hri);
-      return;
-    }
-    ServerName rsnServerName = regionStateNode.getServerName();
-    if (rsnServerName != null && !rsnServerName.equals(serverName)) {
-      return;
-    } else if (rsnServerName == null) {
-      LOG.warn("Empty ServerName in RegionStateNode; proceeding anyways in case latched " +
-          "RecoverMetaProcedure so meta latch gets cleaned up.");
-    }
-    // meta has been assigned to crashed server.
-    LOG.info("Meta assigned to crashed " + serverName + "; reassigning...");
-    // Handle failure and wake event
-    RegionTransitionProcedure rtp = getRegionStates().getRegionTransitionProcedure(hri);
-    // Do not need to consider for REGION_TRANSITION_QUEUE step
-    if (rtp != null && rtp.isMeta() &&
-        rtp.getTransitionState() == RegionTransitionState.REGION_TRANSITION_DISPATCH) {
-      LOG.debug("Failing " + rtp.toString());
-      rtp.remoteCallFailed(master.getMasterProcedureExecutor().getEnvironment(), serverName,
-          new ServerCrashException(rtp.getProcId(), serverName));
-    }
-  }
-
   @VisibleForTesting
   MasterServices getMaster() {
     return master;

http://git-wip-us.apache.org/repos/asf/hbase/blob/3e1f24ed/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
index 00ace72..b8fda06 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
@@ -205,7 +205,7 @@ public abstract class RegionTransitionProcedure
     this.transitionState = state;
   }
 
-  RegionTransitionState getTransitionState() {
+  protected RegionTransitionState getTransitionState() {
     return transitionState;
   }
 
@@ -245,7 +245,7 @@ public abstract class RegionTransitionProcedure
   }
 
   @Override
-  public synchronized void remoteCallFailed(final MasterProcedureEnv env,
+  public synchronized boolean remoteCallFailed(final MasterProcedureEnv env,
       final ServerName serverName, final IOException exception) {
     final RegionStateNode regionNode = getRegionState(env);
     LOG.warn("Remote call failed {} {}", this, regionNode.toShortString(), exception);
@@ -254,7 +254,9 @@ public abstract class RegionTransitionProcedure
       // Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond
       // this method. Just get out of this current processing quickly.
       regionNode.getProcedureEvent().wake(env.getProcedureScheduler());
+      return true;
     }
+    return false;
     // else leave the procedure in suspended state; it is waiting on another call to this callback
   }
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/3e1f24ed/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
index 6ec0079..9fe5545 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
@@ -419,13 +419,15 @@ public class ServerCrashProcedure
       if (sce == null) {
         sce = new ServerCrashException(getProcId(), getServerName());
       }
-      rtp.remoteCallFailed(env, this.serverName, sce);
-      // If an assign, remove from passed-in list of regions so we subsequently do not create
-      // a new assign; the exisitng assign after the call to remoteCallFailed will recalibrate
-      // and assign to a server other than the crashed one; no need to create new assign.
-      // If an unassign, do not return this region; the above cancel will wake up the unassign and
-      // it will complete. Done.
-      it.remove();
+      if(rtp.remoteCallFailed(env, this.serverName, sce)) {
+        // If an assign, remove from passed-in list of regions so we subsequently do not create
+        // a new assign; the exisitng assign after the call to remoteCallFailed will recalibrate
+        // and assign to a server other than the crashed one; no need to create new assign.
+        // If an unassign, do not return this region; the above cancel will wake up the unassign and
+        // it will complete. Done.
+        it.remove();
+      }
+
     }
     return toAssign;
   }

http://git-wip-us.apache.org/repos/asf/hbase/blob/3e1f24ed/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestAssignProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestAssignProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestAssignProcedure.java
index 8836c9f..5067fc3 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestAssignProcedure.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestAssignProcedure.java
@@ -108,11 +108,12 @@ public class TestAssignProcedure {
     }
 
     @Override
-    public void remoteCallFailed(final MasterProcedureEnv env,
+    public boolean remoteCallFailed(final MasterProcedureEnv env,
         final ServerName serverName, final IOException exception) {
       // Just skip this remoteCallFailed. Its too hard to mock. Assert it is called though.
       // Happens after the code we are testing has been called.
       this.remoteCallFailedWasCalled.set(true);
+      return true;
     }
   };