You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by zh...@apache.org on 2018/08/24 02:08:34 UTC

hbase git commit: HBASE-21095 The timeout retry logic for several procedures are broken after master restarts

Repository: hbase
Updated Branches:
  refs/heads/master 780670ede -> aac1a7014


HBASE-21095 The timeout retry logic for several procedures are broken after master restarts


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/aac1a701
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/aac1a701
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/aac1a701

Branch: refs/heads/master
Commit: aac1a70147ec950c98db5f31d8906ce54547523a
Parents: 780670e
Author: zhangduo <zh...@apache.org>
Authored: Thu Aug 23 22:16:13 2018 +0800
Committer: Duo Zhang <zh...@apache.org>
Committed: Fri Aug 24 10:07:31 2018 +0800

----------------------------------------------------------------------
 .../master/assignment/AssignmentManager.java    | 67 +++++++++++++++-----
 .../assignment/TransitRegionStateProcedure.java | 12 ++--
 .../master/procedure/ServerCrashProcedure.java  | 15 +++--
 .../assignment/TestCloseRegionWhileRSCrash.java | 11 +++-
 4 files changed, 75 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/aac1a701/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index 9b020c8..a91f8e4 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -1414,12 +1414,25 @@ public class AssignmentManager implements ServerListener {
   //  Region Status update
   //  Should only be called in TransitRegionStateProcedure
   // ============================================================================================
+  private void transitStateAndUpdate(RegionStateNode regionNode, RegionState.State newState,
+      RegionState.State... expectedStates) throws IOException {
+    RegionState.State state = regionNode.getState();
+    regionNode.transitionState(newState, expectedStates);
+    boolean succ = false;
+    try {
+      regionStateStore.updateRegionLocation(regionNode);
+      succ = true;
+    } finally {
+      if (!succ) {
+        // revert
+        regionNode.setState(state);
+      }
+    }
+  }
 
   // should be called within the synchronized block of RegionStateNode
   void regionOpening(RegionStateNode regionNode) throws IOException {
-    regionNode.transitionState(State.OPENING, RegionStates.STATES_EXPECTED_ON_OPEN);
-    regionStateStore.updateRegionLocation(regionNode);
-
+    transitStateAndUpdate(regionNode, State.OPENING, RegionStates.STATES_EXPECTED_ON_OPEN);
     regionStates.addRegionToServer(regionNode);
     // update the operation count metrics
     metrics.incrementOperationCounter();
@@ -1429,23 +1442,33 @@ public class AssignmentManager implements ServerListener {
   // The parameter 'giveUp' means whether we will try to open the region again, if it is true, then
   // we will persist the FAILED_OPEN state into hbase:meta.
   void regionFailedOpen(RegionStateNode regionNode, boolean giveUp) throws IOException {
-    if (regionNode.getRegionLocation() != null) {
-      regionStates.removeRegionFromServer(regionNode.getRegionLocation(), regionNode);
-    }
+    RegionState.State state = regionNode.getState();
+    ServerName regionLocation = regionNode.getRegionLocation();
     if (giveUp) {
       regionNode.setState(State.FAILED_OPEN);
       regionNode.setRegionLocation(null);
-      regionStateStore.updateRegionLocation(regionNode);
+      boolean succ = false;
+      try {
+        regionStateStore.updateRegionLocation(regionNode);
+        succ = true;
+      } finally {
+        if (!succ) {
+          // revert
+          regionNode.setState(state);
+          regionNode.setRegionLocation(regionLocation);
+        }
+      }
+    }
+    if (regionLocation != null) {
+      regionStates.removeRegionFromServer(regionLocation, regionNode);
     }
   }
 
   // should be called within the synchronized block of RegionStateNode
   void regionOpened(RegionStateNode regionNode) throws IOException {
-    regionNode.transitionState(State.OPEN, RegionStates.STATES_EXPECTED_ON_OPEN);
     // TODO: OPENING Updates hbase:meta too... we need to do both here and there?
     // That is a lot of hbase:meta writing.
-    regionStateStore.updateRegionLocation(regionNode);
-
+    transitStateAndUpdate(regionNode, State.OPEN, RegionStates.STATES_EXPECTED_ON_OPEN);
     RegionInfo hri = regionNode.getRegionInfo();
     if (isMetaRegion(hri)) {
       // Usually we'd set a table ENABLED at this stage but hbase:meta is ALWAYs enabled, it
@@ -1460,7 +1483,7 @@ public class AssignmentManager implements ServerListener {
 
   // should be called within the synchronized block of RegionStateNode
   void regionClosing(RegionStateNode regionNode) throws IOException {
-    regionNode.transitionState(State.CLOSING, RegionStates.STATES_EXPECTED_ON_CLOSE);
+    transitStateAndUpdate(regionNode, State.CLOSING, RegionStates.STATES_EXPECTED_ON_CLOSE);
     regionStateStore.updateRegionLocation(regionNode);
 
     RegionInfo hri = regionNode.getRegionInfo();
@@ -1477,16 +1500,26 @@ public class AssignmentManager implements ServerListener {
   // The parameter 'normally' means whether we are closed cleanly, if it is true, then it means that
   // we are closed due to a RS crash.
   void regionClosed(RegionStateNode regionNode, boolean normally) throws IOException {
+    RegionState.State state = regionNode.getState();
+    ServerName regionLocation = regionNode.getRegionLocation();
     regionNode.transitionState(normally ? State.CLOSED : State.ABNORMALLY_CLOSED,
       RegionStates.STATES_EXPECTED_ON_CLOSE);
-    ServerName loc = regionNode.getRegionLocation();
-    if (loc != null) {
-      // could be a retry so add a check here to avoid set the lastHost to null.
-      regionNode.setLastHost(loc);
+    boolean succ = false;
+    try {
+      regionStateStore.updateRegionLocation(regionNode);
+      succ = true;
+    } finally {
+      if (!succ) {
+        // revert
+        regionNode.setState(state);
+        regionNode.setRegionLocation(regionLocation);
+      }
+    }
+    if (regionLocation != null) {
+      regionNode.setLastHost(regionLocation);
       regionNode.setRegionLocation(null);
-      regionStates.removeRegionFromServer(loc, regionNode);
+      regionStates.removeRegionFromServer(regionLocation, regionNode);
     }
-    regionStateStore.updateRegionLocation(regionNode);
   }
 
   public void markRegionAsSplit(final RegionInfo parent, final ServerName serverName,

http://git-wip-us.apache.org/repos/asf/hbase/blob/aac1a701/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
index e853b9b..0d2c4b2 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
@@ -326,13 +326,9 @@ public class TransitRegionStateProcedure
         "Failed transition, suspend {}secs {}; {}; waiting on rectified condition fixed " +
           "by other Procedure or operator intervention",
         backoff / 1000, this, regionNode.toShortString(), e);
-      regionNode.getProcedureEvent().suspend();
-      if (regionNode.getProcedureEvent().suspendIfNotReady(this)) {
-        setTimeout(Math.toIntExact(backoff));
-        setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
-        throw new ProcedureSuspendedException();
-      }
-      return Flow.HAS_MORE_STATE;
+      setTimeout(Math.toIntExact(backoff));
+      setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
+      throw new ProcedureSuspendedException();
     }
   }
 
@@ -342,7 +338,7 @@ public class TransitRegionStateProcedure
   @Override
   protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
     setState(ProcedureProtos.ProcedureState.RUNNABLE);
-    getRegionStateNode(env).getProcedureEvent().wake(env.getProcedureScheduler());
+    env.getProcedureScheduler().addFront(this);
     return false; // 'false' means that this procedure handled the timeout
   }
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/aac1a701/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
index db7a872..1fcc6eb 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
@@ -114,6 +114,17 @@ public class ServerCrashProcedure
       notifiedDeadServer = true;
     }
 
+    switch (state) {
+      case SERVER_CRASH_START:
+      case SERVER_CRASH_SPLIT_META_LOGS:
+      case SERVER_CRASH_ASSIGN_META:
+        break;
+      default:
+        // If hbase:meta is not assigned, yield.
+        if (env.getAssignmentManager().waitMetaLoaded(this)) {
+          throw new ProcedureSuspendedException();
+        }
+    }
     try {
       switch (state) {
         case SERVER_CRASH_START:
@@ -134,10 +145,6 @@ public class ServerCrashProcedure
           setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS);
           break;
         case SERVER_CRASH_GET_REGIONS:
-          // If hbase:meta is not assigned, yield.
-          if (env.getAssignmentManager().waitMetaLoaded(this)) {
-            throw new ProcedureSuspendedException();
-          }
           this.regionsOnCrashedServer =
             services.getAssignmentManager().getRegionStates().getServerRegionInfoSet(serverName);
           // Where to go next? Depends on whether we should split logs at all or

http://git-wip-us.apache.org/repos/asf/hbase/blob/aac1a701/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java
index 9b1f4ca..3573bd6 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java
@@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.client.Put;
 import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.master.ServerManager;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
 import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
 import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface;
@@ -149,6 +150,7 @@ public class TestCloseRegionWhileRSCrash {
 
   @BeforeClass
   public static void setUp() throws Exception {
+    UTIL.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
     UTIL.startMiniCluster(3);
     UTIL.createTable(TABLE_NAME, CF);
     UTIL.getAdmin().balancerSwitch(false, true);
@@ -174,7 +176,7 @@ public class TestCloseRegionWhileRSCrash {
     HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs);
     ProcedureExecutor<MasterProcedureEnv> procExec =
       UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
-    procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName()));
+    long dummyProcId = procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName()));
     ARRIVE.await();
     UTIL.getMiniHBaseCluster().killRegionServer(srcRs.getServerName());
     UTIL.waitFor(30000,
@@ -206,7 +208,14 @@ public class TestCloseRegionWhileRSCrash {
       }
       Thread.sleep(1000);
     }
+    // let's close the connection to make sure that the SCP can not update meta successfully
+    UTIL.getMiniHBaseCluster().getMaster().getConnection().close();
     RESUME.countDown();
+    UTIL.waitFor(30000, () -> procExec.isFinished(dummyProcId));
+    Thread.sleep(2000);
+    // here we restart
+    UTIL.getMiniHBaseCluster().stopMaster(0).join();
+    UTIL.getMiniHBaseCluster().startMaster();
     t.join();
     // Make sure that the region is online, it may not on the original target server, as we will set
     // forceNewPlan to true if there is a server crash