You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2018/05/01 00:51:45 UTC

hbase git commit: HBASE-20492 UnassignProcedure is stuck in retry loop on region stuck in OPENING state

Repository: hbase
Updated Branches:
  refs/heads/branch-2.0 73a59a930 -> 31e0cd42a


HBASE-20492 UnassignProcedure is stuck in retry loop on region stuck in OPENING state

Add backoff when stuck in RegionTransitionProcedure, the subclass of
AssignProcedure and UnassignProcedure. Can happen when we go to
transition but the current Region state is not what we expect.

M hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java
 Add doc on being able to suspend and wait on a timeout.

M hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
 Add 'attempt' counter so we can do backoff when we get stuck.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java
M hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
 Add persistence of new 'attempt' counter

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
 Doc data members that are persisted by subclasses given this is 'odd'.
 Add a counter for 'attempts' used when 'stuck' to implement backoff.
 Add suspend with timeout when 'stuck'. Add callback when timeout is
 exhausted which does wakeup of this procedure.

A hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestUnexpectedStateException.java
 Test of backoff.


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/31e0cd42
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/31e0cd42
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/31e0cd42

Branch: refs/heads/branch-2.0
Commit: 31e0cd42a23c1adeba4cf7f39b2ac619fe0dce40
Parents: 73a59a9
Author: Michael Stack <st...@apache.org>
Authored: Sat Apr 28 16:24:08 2018 -0700
Committer: Michael Stack <st...@apache.org>
Committed: Mon Apr 30 17:49:08 2018 -0700

----------------------------------------------------------------------
 .../hbase/procedure2/DelayedProcedure.java      |   3 +
 .../hadoop/hbase/procedure2/Procedure.java      |   7 +-
 .../hbase/procedure2/TestProcedureEvents.java   |   7 +
 .../src/main/protobuf/MasterProcedure.proto     |   4 +
 .../master/assignment/AssignProcedure.java      |  12 +-
 .../master/assignment/RegionStateStore.java     |   5 +-
 .../assignment/RegionTransitionProcedure.java   |  78 ++++++++-
 .../master/assignment/UnassignProcedure.java    |   6 +
 .../TestUnexpectedStateException.java           | 167 +++++++++++++++++++
 9 files changed, 279 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/31e0cd42/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/DelayedProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/DelayedProcedure.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/DelayedProcedure.java
index fcec0b7..a9f3e7d 100644
--- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/DelayedProcedure.java
+++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/DelayedProcedure.java
@@ -20,6 +20,9 @@ package org.apache.hadoop.hbase.procedure2;
 import org.apache.hadoop.hbase.procedure2.util.DelayedUtil;
 import org.apache.yetus.audience.InterfaceAudience;
 
+/**
+ * Vessel that carries a Procedure and a timeout.
+ */
 @InterfaceAudience.Private
 class DelayedProcedure extends DelayedUtil.DelayedContainerWithTimestamp<Procedure<?>> {
   public DelayedProcedure(Procedure<?> procedure) {

http://git-wip-us.apache.org/repos/asf/hbase/blob/31e0cd42/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java
----------------------------------------------------------------------
diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java
index a68417c..545bedf 100644
--- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java
+++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java
@@ -75,12 +75,17 @@ import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesti
  * during the execute() step. In case of failure and restart, rollback() may be
  * called multiple times, so again the code must be idempotent.
  *
- * <p>Procedure can be made respect a locking regime. It has acqure/release methods as
+ * <p>Procedure can be made respect a locking regime. It has acquire/release methods as
  * well as an {@link #hasLock(Object)}. The lock implementation is up to the implementor.
  * If an entity needs to be locked for the life of a procedure -- not just the calls to
  * execute -- then implementations should say so with the {@link #holdLock(Object)}
  * method.
  *
+ * <p>Procedures can be suspended or put in wait state with a callback that gets executed on
+ * Procedure-specified timeout. See {@link #setTimeout(int)}}, and
+ * {@link #setTimeoutFailure(Object)}. See TestProcedureEvents and the
+ * TestTimeoutEventProcedure class for an example usage.</p>
+ *
  * <p>There are hooks for collecting metrics on submit of the procedure and on finish.
  * See {@link #updateMetricsOnSubmit(Object)} and
  * {@link #updateMetricsOnFinish(Object, long, boolean)}.

http://git-wip-us.apache.org/repos/asf/hbase/blob/31e0cd42/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestProcedureEvents.java
----------------------------------------------------------------------
diff --git a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestProcedureEvents.java b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestProcedureEvents.java
index cd56c46..b7c59c8 100644
--- a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestProcedureEvents.java
+++ b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestProcedureEvents.java
@@ -80,6 +80,13 @@ public class TestProcedureEvents {
     fs.delete(logDir, true);
   }
 
+  /**
+   * Tests being able to suspend a Procedure for N timeouts and then failing.s
+   * Resets the timeout after each elapses. See {@link TestTimeoutEventProcedure} for example
+   * of how to do this sort of trickery with the ProcedureExecutor; i.e. suspend for a while,
+   * check for a condition and if not set, suspend again, etc., ultimately failing or succeeding
+   * eventually.
+   */
   @Test
   public void testTimeoutEventProcedure() throws Exception {
     final int NTIMEOUTS = 5;

http://git-wip-us.apache.org/repos/asf/hbase/blob/31e0cd42/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
----------------------------------------------------------------------
diff --git a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
index 7d1ca6b..3af4757 100644
--- a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
+++ b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
@@ -322,6 +322,8 @@ message AssignRegionStateData {
   required RegionInfo region_info = 2;
   optional bool force_new_plan = 3 [default = false];
   optional ServerName target_server = 4;
+  // Current attempt index used for expotential backoff when stuck
+  optional int32 attempt = 5;
 }
 
 message UnassignRegionStateData {
@@ -335,6 +337,8 @@ message UnassignRegionStateData {
   optional ServerName hosting_server = 5;
   optional bool force = 4 [default = false];
   optional bool remove_after_unassigning = 6 [default = false];
+  // Current attempt index used for expotential backoff when stuck
+  optional int32 attempt = 7;
 }
 
 enum MoveRegionState {

http://git-wip-us.apache.org/repos/asf/hbase/blob/31e0cd42/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java
index 0ece343..b1d0b46 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java
@@ -134,6 +134,9 @@ public class AssignProcedure extends RegionTransitionProcedure {
     if (this.targetServer != null) {
       state.setTargetServer(ProtobufUtil.toServerName(this.targetServer));
     }
+    if (getAttempt() > 0) {
+      state.setAttempt(getAttempt());
+    }
     serializer.serialize(state.build());
   }
 
@@ -147,6 +150,9 @@ public class AssignProcedure extends RegionTransitionProcedure {
     if (state.hasTargetServer()) {
       this.targetServer = ProtobufUtil.toServerName(state.getTargetServer());
     }
+    if (state.hasAttempt()) {
+      setAttempt(state.getAttempt());
+    }
   }
 
   @Override
@@ -185,10 +191,12 @@ public class AssignProcedure extends RegionTransitionProcedure {
       return false;
     }
 
-    // Send assign (add into assign-pool). Region is now in OFFLINE state. Setting offline state
-    // scrubs what was the old region location. Setting a new regionLocation here is how we retain
+    // Send assign (add into assign-pool). We call regionNode.offline below to set state to
+    // OFFLINE and to clear the region location. Setting a new regionLocation here is how we retain
     // old assignment or specify target server if a move or merge. See
     // AssignmentManager#processAssignQueue. Otherwise, balancer gives us location.
+    // TODO: Region will be set into OFFLINE state below regardless of what its previous state was
+    // This is dangerous? Wrong? What if region was in an unexpected state?
     ServerName lastRegionLocation = regionNode.offline();
     boolean retain = false;
     if (!forceNewPlan) {

http://git-wip-us.apache.org/repos/asf/hbase/blob/31e0cd42/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java
index 40c7f2f..128e409 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java
@@ -38,6 +38,7 @@ import org.apache.hadoop.hbase.client.Table;
 import org.apache.hadoop.hbase.client.TableDescriptor;
 import org.apache.hadoop.hbase.master.MasterServices;
 import org.apache.hadoop.hbase.master.RegionState.State;
+import org.apache.hadoop.hbase.procedure2.Procedure;
 import org.apache.hadoop.hbase.procedure2.util.StringUtils;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
@@ -133,7 +134,9 @@ public class RegionStateStore {
           regionStateNode.getOpenSeqNum() : HConstants.NO_SEQNUM;
       updateUserRegionLocation(regionStateNode.getRegionInfo(), regionStateNode.getState(),
           regionStateNode.getRegionLocation(), regionStateNode.getLastHost(), openSeqNum,
-          regionStateNode.getProcedure().getProcId());
+          // The regionStateNode may have no procedure in a test scenario; allow for this.
+          regionStateNode.getProcedure() != null?
+              regionStateNode.getProcedure().getProcId(): Procedure.NO_PROC_ID);
     }
   }
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/31e0cd42/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
index 6c63cb8..7c99670 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
@@ -30,12 +30,14 @@ import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
 import org.apache.hadoop.hbase.master.procedure.TableProcedureInterface;
 import org.apache.hadoop.hbase.procedure2.Procedure;
+import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
 import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
 import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteOperation;
 import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteProcedure;
-import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
+import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -74,11 +76,18 @@ import org.slf4j.LoggerFactory;
  * intentionally not implemented. It is a 'one shot' procedure. See its class doc for how it
  * handles failure.
  * </li>
+ * <li>If we find a region in an 'unexpected' state, we'll complain and retry with backoff forever.
+ * The 'unexpected' state needs to be fixed either by another running Procedure or by operator
+ * intervention (Regions in 'unexpected' state indicates bug or unexpected transition type).
+ * For this to work, subclasses need to persist the 'attempt' counter kept in this class when
+ * they do serializeStateData and restore it inside their deserializeStateData, just as they do
+ * for {@link #regionInfo}.
+ * </li>
  * </ul>
  * </p>
  *
- * <p>TODO: Considering it is a priority doing all we can to get make a region available as soon as possible,
- * re-attempting with any target makes sense if specified target fails in case of
+ * <p>TODO: Considering it is a priority doing all we can to get make a region available as soon as
+ * possible, re-attempting with any target makes sense if specified target fails in case of
  * {@link AssignProcedure}. For {@link UnassignProcedure}, our concern is preventing data loss
  * on failed unassign. See class doc for explanation.
  */
@@ -92,7 +101,19 @@ public abstract class RegionTransitionProcedure
   protected final AtomicBoolean aborted = new AtomicBoolean(false);
 
   private RegionTransitionState transitionState = RegionTransitionState.REGION_TRANSITION_QUEUE;
+  /**
+   * This data member must be persisted. Expectation is that it is done by subclasses in their
+   * {@link #serializeStateData(ProcedureStateSerializer)} call, restoring {@link #regionInfo}
+   * in their {@link #deserializeStateData(ProcedureStateSerializer)} method.
+   */
   private RegionInfo regionInfo;
+
+  /**
+   * Like {@link #regionInfo}, the expectation is that subclasses persist the value of this
+   * data member. It is used doing backoff when Procedure gets stuck.
+   */
+  private int attempt;
+
   private volatile boolean lock = false;
 
   // Required by the Procedure framework to create the procedure on replay
@@ -107,11 +128,30 @@ public abstract class RegionTransitionProcedure
     return regionInfo;
   }
 
+  /**
+   * This setter is for subclasses to call in their
+   * {@link #deserializeStateData(ProcedureStateSerializer)} method. Expectation is that
+   * subclasses will persist `regioninfo` in their
+   * {@link #serializeStateData(ProcedureStateSerializer)} method and then restore `regionInfo` on
+   * deserialization by calling.
+   */
   protected void setRegionInfo(final RegionInfo regionInfo) {
-    // Setter is for deserialization.
     this.regionInfo = regionInfo;
   }
 
+  /**
+   * This setter is for subclasses to call in their
+   * {@link #deserializeStateData(ProcedureStateSerializer)} method.
+   * @see #setRegionInfo(RegionInfo)
+   */
+  protected void setAttempt(int attempt) {
+    this.attempt = attempt;
+  }
+
+  protected int getAttempt() {
+    return this.attempt;
+  }
+
   @Override
   public TableName getTableName() {
     RegionInfo hri = getRegionInfo();
@@ -328,14 +368,40 @@ public abstract class RegionTransitionProcedure
             return null;
         }
       } while (retry);
+      // If here, success so clear out the attempt counter so we start fresh each time we get stuck.
+      this.attempt = 0;
     } catch (IOException e) {
-      LOG.warn("Retryable error trying to transition: " +
-          this + "; " + regionNode.toShortString(), e);
+      long backoff = getBackoffTime(this.attempt++);
+      LOG.warn("Failed transition, suspend {}secs {}; {}; waiting on rectified condition fixed " +
+              "by other Procedure or operator intervention", backoff / 1000, this,
+          regionNode.toShortString(), e);
+      getRegionState(env).getProcedureEvent().suspend();
+      if (getRegionState(env).getProcedureEvent().suspendIfNotReady(this)) {
+        setTimeout(Math.toIntExact(backoff));
+        setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
+        throw new ProcedureSuspendedException();
+      }
     }
 
     return new Procedure[] {this};
   }
 
+  private long getBackoffTime(int attempts) {
+    long backoffTime = (long)(1000 * Math.pow(2, attempts));
+    long maxBackoffTime = 60 * 60 * 1000; // An hour. Hard-coded for for now.
+    return backoffTime < maxBackoffTime? backoffTime: maxBackoffTime;
+  }
+
+  /**
+   * At end of timeout, wake ourselves up so we run again.
+   */
+  @Override
+  protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
+    setState(ProcedureProtos.ProcedureState.RUNNABLE);
+    getRegionState(env).getProcedureEvent().wake(env.getProcedureScheduler());
+    return false; // 'false' means that this procedure handled the timeout
+  }
+
   @Override
   protected void rollback(final MasterProcedureEnv env) {
     if (isRollbackSupported(transitionState)) {

http://git-wip-us.apache.org/repos/asf/hbase/blob/31e0cd42/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
index 03f5213..bdbf003 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
@@ -153,6 +153,9 @@ public class UnassignProcedure extends RegionTransitionProcedure {
     if (removeAfterUnassigning) {
       state.setRemoveAfterUnassigning(true);
     }
+    if (getAttempt() > 0) {
+      state.setAttempt(getAttempt());
+    }
     serializer.serialize(state.build());
   }
 
@@ -169,6 +172,9 @@ public class UnassignProcedure extends RegionTransitionProcedure {
       this.destinationServer = ProtobufUtil.toServerName(state.getDestinationServer());
     }
     removeAfterUnassigning = state.getRemoveAfterUnassigning();
+    if (state.hasAttempt()) {
+      setAttempt(state.getAttempt());
+    }
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/hbase/blob/31e0cd42/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestUnexpectedStateException.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestUnexpectedStateException.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestUnexpectedStateException.java
new file mode 100644
index 0000000..0f62f8e
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestUnexpectedStateException.java
@@ -0,0 +1,167 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master.assignment;
+
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.Admin;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.Threads;
+import org.apache.hbase.thirdparty.com.google.gson.JsonArray;
+import org.apache.hbase.thirdparty.com.google.gson.JsonElement;
+import org.apache.hbase.thirdparty.com.google.gson.JsonObject;
+import org.apache.hbase.thirdparty.com.google.gson.JsonParser;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.rules.TestName;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Tests for HBASE-18408 "AM consumes CPU and fills up the logs really fast when there is no RS to
+ * assign". If an {@link org.apache.hadoop.hbase.exceptions.UnexpectedStateException}, we'd spin on
+ * the ProcedureExecutor consuming CPU and filling logs. Test new back-off facility.
+ */
+@Category({MasterTests.class, MediumTests.class})
+public class TestUnexpectedStateException {
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+      HBaseClassTestRule.forClass(TestUnexpectedStateException.class);
+  @Rule public final TestName name = new TestName();
+
+  private static final Logger LOG = LoggerFactory.getLogger(TestUnexpectedStateException.class);
+  private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+  private static final byte [] FAMILY = Bytes.toBytes("family");
+  private TableName tableName;
+  private static final int REGIONS = 10;
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    TEST_UTIL.startMiniCluster();
+  }
+
+  @AfterClass
+  public static void afterClass() throws Exception {
+    TEST_UTIL.shutdownMiniCluster();
+  }
+
+  @Before
+  public void before() throws IOException {
+    this.tableName = TableName.valueOf(this.name.getMethodName());
+    TEST_UTIL.createMultiRegionTable(this.tableName, FAMILY, REGIONS);
+  }
+
+  private RegionInfo pickArbitraryRegion(Admin admin) throws IOException {
+    List<RegionInfo> regions = admin.getRegions(this.tableName);
+    return regions.get(3);
+  }
+
+  /**
+   * Manufacture a state that will throw UnexpectedStateException.
+   * Change an assigned region's 'state' to be OPENING. That'll mess up a subsequent unassign
+   * causing it to throw UnexpectedStateException. We can easily manufacture this infinite retry
+   * state in UnassignProcedure because it has no startTransition. AssignProcedure does where it
+   * squashes whatever the current region state is making it OFFLINE. That makes it harder to mess
+   * it up. Make do with UnassignProcedure for now.
+   */
+  @Test
+  public void testUnableToAssign() throws Exception {
+    try (Admin admin = TEST_UTIL.getAdmin()) {
+      // Pick a random region from this tests' table to play with. Get its RegionStateNode.
+      // Clone it because the original will be changed by the system. We need clone to fake out
+      // a state.
+      final RegionInfo region = pickArbitraryRegion(admin);
+      AssignmentManager am = TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager();
+      RegionStates.RegionStateNode rsn =  am.getRegionStates().getRegionStateNode(region);
+      // Now force region to be in OPENING state.
+      am.markRegionAsOpening(rsn);
+      // Now the 'region' is in an artificially bad state, try an unassign again.
+      // Run unassign in a thread because it is blocking.
+      Runnable unassign = () -> {
+        try {
+          admin.unassign(region.getRegionName(), true);
+        } catch (IOException ioe) {
+          fail("Failed assign");
+        }
+      };
+      Thread t = new Thread(unassign, "unassign");
+      t.start();
+      while(!t.isAlive()) {
+        Threads.sleep(100);
+      }
+      Threads.sleep(1000);
+      // Unassign should be running and failing. Look for incrementing timeout as evidence that
+      // Unassign is stuck and doing backoff.
+      // Now fix the condition we were waiting on so the unassign can complete.
+      JsonParser parser = new JsonParser();
+      long oldTimeout = 0;
+      int timeoutIncrements = 0;
+      while (true) {
+        long timeout = getUnassignTimeout(parser, admin.getProcedures());
+        if (timeout > oldTimeout) {
+          LOG.info("Timeout incremented, was {}, now is {}, increments={}",
+              timeout, oldTimeout, timeoutIncrements);
+          oldTimeout = timeout;
+          timeoutIncrements++;
+          if (timeoutIncrements > 3) {
+            // If we incremented at least twice, break; the backoff is working.
+            break;
+          }
+        }
+        Thread.sleep(1000);
+      }
+      am.markRegionAsOpened(rsn);
+      t.join();
+    }
+  }
+
+  /**
+   * @param proceduresAsJSON This is String returned by admin.getProcedures call... an array of
+   *                         Procedures as JSON.
+   * @return The Procedure timeout value parsed from the Unassign Procedure.
+   * @Exception Thrown if we do not find UnassignProcedure or fail to parse timeout.
+   */
+  private long getUnassignTimeout(JsonParser parser, String proceduresAsJSON) throws Exception {
+    JsonArray array = parser.parse(proceduresAsJSON).getAsJsonArray();
+    Iterator<JsonElement> iterator = array.iterator();
+    while (iterator.hasNext()) {
+      JsonElement element = iterator.next();
+      JsonObject obj = element.getAsJsonObject();
+      String className = obj.get("className").getAsString();
+      String actualClassName = UnassignProcedure.class.getName();
+      if (className.equals(actualClassName)) {
+        return obj.get("timeout").getAsLong();
+      }
+    }
+    throw new Exception("Failed to find UnassignProcedure or timeout in " + proceduresAsJSON);
+  }
+}