You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2018/08/14 23:29:38 UTC
hbase git commit: HBASE-20978 [amv2] Worker terminating UNNATURALLY
during MoveRegionProcedure Signed-off-by: Michael Stack
Signed-off-by: Duo Zhang
Repository: hbase
Updated Branches:
refs/heads/branch-2.0 d49fca134 -> 5e06b80e7
HBASE-20978 [amv2] Worker terminating UNNATURALLY during MoveRegionProcedure Signed-off-by: Michael Stack <st...@apache.org> Signed-off-by: Duo Zhang <zh...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/5e06b80e
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/5e06b80e
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/5e06b80e
Branch: refs/heads/branch-2.0
Commit: 5e06b80e75fb38d1d05a9c30af1fc44deec542d1
Parents: d49fca1
Author: Allan Yang <al...@apache.org>
Authored: Tue Aug 14 12:09:42 2018 -0700
Committer: Michael Stack <st...@apache.org>
Committed: Tue Aug 14 16:29:24 2018 -0700
----------------------------------------------------------------------
.../hbase/procedure2/ProcedureExecutor.java | 65 ++++++++++++++++++--
.../procedure2/ProcedureTestingUtility.java | 7 +++
.../hbase/procedure2/TestChildProcedures.java | 27 ++++++++
.../src/test/resources/hbase-site.xml | 4 ++
4 files changed, 99 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hbase/blob/5e06b80e/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java
----------------------------------------------------------------------
diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java
index 3c2998c..1e0ee79 100644
--- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java
+++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java
@@ -83,12 +83,34 @@ public class ProcedureExecutor<TEnvironment> {
"hbase.procedure.worker.keep.alive.time.msec";
private static final long DEFAULT_WORKER_KEEP_ALIVE_TIME = TimeUnit.MINUTES.toMillis(1);
+ /**
+ * {@link #testing} is non-null when ProcedureExecutor is being tested. Tests will try to
+ * break PE having it fail at various junctures. When non-null, testing is set to an instance of
+ * the below internal {@link Testing} class with flags set for the particular test.
+ */
Testing testing = null;
+
+ /**
+ * Class with parameters describing how to fail/die when in testing-context.
+ */
public static class Testing {
protected boolean killIfSuspended = false;
+
+ /**
+ * Kill the PE BEFORE we store state to the WAL. Good for figuring out if a Procedure is
+ * persisting all the state it needs to recover after a crash.
+ */
protected boolean killBeforeStoreUpdate = false;
protected boolean toggleKillBeforeStoreUpdate = false;
+ /**
+ * Set when we want to fail AFTER state has been stored into the WAL. Rarely used. HBASE-20978
+ * is about a case where memory-state was being set after store to WAL where a crash could
+ * cause us to get stuck. This flag allows killing at what was a vulnerable time.
+ */
+ protected boolean killAfterStoreUpdate = false;
+ protected boolean toggleKillAfterStoreUpdate = false;
+
protected boolean shouldKillBeforeStoreUpdate() {
final boolean kill = this.killBeforeStoreUpdate;
if (this.toggleKillBeforeStoreUpdate) {
@@ -101,6 +123,19 @@ public class ProcedureExecutor<TEnvironment> {
protected boolean shouldKillBeforeStoreUpdate(final boolean isSuspended) {
return (isSuspended && !killIfSuspended) ? false : shouldKillBeforeStoreUpdate();
}
+
+ protected boolean shouldKillAfterStoreUpdate() {
+ final boolean kill = this.killAfterStoreUpdate;
+ if (this.toggleKillAfterStoreUpdate) {
+ this.killAfterStoreUpdate = !kill;
+ LOG.warn("Toggle KILL after store update to: " + this.killAfterStoreUpdate);
+ }
+ return kill;
+ }
+
+ protected boolean shouldKillAfterStoreUpdate(final boolean isSuspended) {
+ return (isSuspended && !killIfSuspended) ? false : shouldKillAfterStoreUpdate();
+ }
}
public interface ProcedureExecutorListener {
@@ -503,6 +538,17 @@ public class ProcedureExecutor<TEnvironment> {
break;
case WAITING:
if (!proc.hasChildren()) {
+ // Normally, WAITING procedures should be waken by its children.
+ // But, there is a case that, all the children are successful and before
+ // they can wake up their parent procedure, the master was killed.
+ // So, during recovering the procedures from ProcedureWal, its children
+ // are not loaded because of their SUCCESS state.
+ // So we need to continue to run this WAITING procedure. But before
+ // executing, we need to set its state to RUNNABLE, otherwise, a exception
+ // will throw:
+ // Preconditions.checkArgument(procedure.getState() == ProcedureState.RUNNABLE,
+ // "NOT RUNNABLE! " + procedure.toString());
+ proc.setState(ProcedureState.RUNNABLE);
runnableList.add(proc);
}
break;
@@ -1562,10 +1608,7 @@ public class ProcedureExecutor<TEnvironment> {
// allows to kill the executor before something is stored to the wal.
// useful to test the procedure recovery.
if (testing != null && testing.shouldKillBeforeStoreUpdate(suspended)) {
- String msg = "TESTING: Kill before store update: " + procedure;
- LOG.debug(msg);
- stop();
- throw new RuntimeException(msg);
+ kill("TESTING: Kill BEFORE store update: " + procedure);
}
// TODO: The code here doesn't check if store is running before persisting to the store as
@@ -1591,6 +1634,14 @@ public class ProcedureExecutor<TEnvironment> {
assert (reExecute && subprocs == null) || !reExecute;
} while (reExecute);
+
+ // Allows to kill the executor after something is stored to the WAL but before the below
+ // state settings are done -- in particular the one on the end where we make parent
+ // RUNNABLE again when its children are done; see countDownChildren.
+ if (testing != null && testing.shouldKillAfterStoreUpdate(suspended)) {
+ kill("TESTING: Kill AFTER store update: " + procedure);
+ }
+
// Submit the new subprocedures
if (subprocs != null && !procedure.isFailed()) {
submitChildrenProcedures(subprocs);
@@ -1608,6 +1659,12 @@ public class ProcedureExecutor<TEnvironment> {
}
}
+ private void kill(String msg) {
+ LOG.debug(msg);
+ stop();
+ throw new RuntimeException(msg);
+ }
+
private Procedure<TEnvironment>[] initializeChildren(RootProcedureState<TEnvironment> procStack,
Procedure<TEnvironment> procedure, Procedure<TEnvironment>[] subprocs) {
assert subprocs != null : "expected subprocedures";
http://git-wip-us.apache.org/repos/asf/hbase/blob/5e06b80e/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java
----------------------------------------------------------------------
diff --git a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java
index e8d72f9..138215b 100644
--- a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java
+++ b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java
@@ -167,6 +167,13 @@ public class ProcedureTestingUtility {
assertSingleExecutorForKillTests(procExecutor);
}
+ public static <TEnv> void toggleKillAfterStoreUpdate(ProcedureExecutor<TEnv> procExecutor) {
+ createExecutorTesting(procExecutor);
+ procExecutor.testing.killAfterStoreUpdate = !procExecutor.testing.killAfterStoreUpdate;
+ LOG.warn("Set Kill after store update to: " + procExecutor.testing.killAfterStoreUpdate);
+ assertSingleExecutorForKillTests(procExecutor);
+ }
+
public static <TEnv> void setKillAndToggleBeforeStoreUpdate(ProcedureExecutor<TEnv> procExecutor,
boolean value) {
ProcedureTestingUtility.setKillBeforeStoreUpdate(procExecutor, value);
http://git-wip-us.apache.org/repos/asf/hbase/blob/5e06b80e/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java
----------------------------------------------------------------------
diff --git a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java
index cce4caf..4f3c443 100644
--- a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java
+++ b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java
@@ -109,6 +109,29 @@ public class TestChildProcedures {
ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId);
}
+
+ /**
+ * Test the state setting that happens after store to WAL; in particular the bit where we
+ * set the parent runnable again after its children have all completed successfully.
+ * See HBASE-20978.
+ */
+ @Test
+ public void testChildLoadWithRestartAfterChildSuccess() throws Exception {
+ procEnv.toggleKillAfterStoreUpdate = true;
+
+ TestRootProcedure proc = new TestRootProcedure();
+ long procId = ProcedureTestingUtility.submitAndWait(procExecutor, proc);
+ int restartCount = 0;
+ while (!procExecutor.isFinished(procId)) {
+ ProcedureTestingUtility.restart(procExecutor);
+ ProcedureTestingUtility.waitProcedure(procExecutor, proc);
+ restartCount++;
+ }
+ assertEquals(4, restartCount);
+ assertTrue("expected completed proc", procExecutor.isFinished(procId));
+ ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId);
+ }
+
@Test
public void testChildRollbackLoad() throws Exception {
procEnv.toggleKillBeforeStoreUpdate = false;
@@ -154,6 +177,9 @@ public class TestChildProcedures {
if (env.toggleKillBeforeStoreUpdate) {
ProcedureTestingUtility.toggleKillBeforeStoreUpdate(procExecutor);
}
+ if (env.toggleKillAfterStoreUpdate) {
+ ProcedureTestingUtility.toggleKillAfterStoreUpdate(procExecutor);
+ }
return new Procedure[] { new TestChildProcedure(), new TestChildProcedure() };
}
@@ -193,6 +219,7 @@ public class TestChildProcedures {
private static class TestProcEnv {
public boolean toggleKillBeforeStoreUpdate = false;
+ public boolean toggleKillAfterStoreUpdate = false;
public boolean triggerRollbackOnChild = false;
}
}
http://git-wip-us.apache.org/repos/asf/hbase/blob/5e06b80e/hbase-procedure/src/test/resources/hbase-site.xml
----------------------------------------------------------------------
diff --git a/hbase-procedure/src/test/resources/hbase-site.xml b/hbase-procedure/src/test/resources/hbase-site.xml
index 114ee8a..3709a71 100644
--- a/hbase-procedure/src/test/resources/hbase-site.xml
+++ b/hbase-procedure/src/test/resources/hbase-site.xml
@@ -22,6 +22,10 @@
-->
<configuration>
<property>
+ <name>hbase.defaults.for.version.skip</name>
+ <value>true</value>
+ </property>
+ <property>
<name>hbase.procedure.store.wal.use.hsync</name>
<value>false</value>
</property>