You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2018/08/14 23:29:38 UTC

hbase git commit: HBASE-20978 [amv2] Worker terminating UNNATURALLY during MoveRegionProcedure Signed-off-by: Michael Stack Signed-off-by: Duo Zhang

Repository: hbase
Updated Branches:
  refs/heads/branch-2.0 d49fca134 -> 5e06b80e7


HBASE-20978 [amv2] Worker terminating UNNATURALLY during MoveRegionProcedure Signed-off-by: Michael Stack <st...@apache.org> Signed-off-by: Duo Zhang <zh...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/5e06b80e
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/5e06b80e
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/5e06b80e

Branch: refs/heads/branch-2.0
Commit: 5e06b80e75fb38d1d05a9c30af1fc44deec542d1
Parents: d49fca1
Author: Allan Yang <al...@apache.org>
Authored: Tue Aug 14 12:09:42 2018 -0700
Committer: Michael Stack <st...@apache.org>
Committed: Tue Aug 14 16:29:24 2018 -0700

----------------------------------------------------------------------
 .../hbase/procedure2/ProcedureExecutor.java     | 65 ++++++++++++++++++--
 .../procedure2/ProcedureTestingUtility.java     |  7 +++
 .../hbase/procedure2/TestChildProcedures.java   | 27 ++++++++
 .../src/test/resources/hbase-site.xml           |  4 ++
 4 files changed, 99 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/5e06b80e/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java
----------------------------------------------------------------------
diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java
index 3c2998c..1e0ee79 100644
--- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java
+++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java
@@ -83,12 +83,34 @@ public class ProcedureExecutor<TEnvironment> {
       "hbase.procedure.worker.keep.alive.time.msec";
   private static final long DEFAULT_WORKER_KEEP_ALIVE_TIME = TimeUnit.MINUTES.toMillis(1);
 
+  /**
+   * {@link #testing} is non-null when ProcedureExecutor is being tested. Tests will try to
+   * break PE having it fail at various junctures. When non-null, testing is set to an instance of
+   * the below internal {@link Testing} class with flags set for the particular test.
+   */
   Testing testing = null;
+
+  /**
+   * Class with parameters describing how to fail/die when in testing-context.
+   */
   public static class Testing {
     protected boolean killIfSuspended = false;
+
+    /**
+     * Kill the PE BEFORE we store state to the WAL. Good for figuring out if a Procedure is
+     * persisting all the state it needs to recover after a crash.
+     */
     protected boolean killBeforeStoreUpdate = false;
     protected boolean toggleKillBeforeStoreUpdate = false;
 
+    /**
+     * Set when we want to fail AFTER state has been stored into the WAL. Rarely used. HBASE-20978
+     * is about a case where memory-state was being set after store to WAL where a crash could
+     * cause us to get stuck. This flag allows killing at what was a vulnerable time.
+     */
+    protected boolean killAfterStoreUpdate = false;
+    protected boolean toggleKillAfterStoreUpdate = false;
+
     protected boolean shouldKillBeforeStoreUpdate() {
       final boolean kill = this.killBeforeStoreUpdate;
       if (this.toggleKillBeforeStoreUpdate) {
@@ -101,6 +123,19 @@ public class ProcedureExecutor<TEnvironment> {
     protected boolean shouldKillBeforeStoreUpdate(final boolean isSuspended) {
       return (isSuspended && !killIfSuspended) ? false : shouldKillBeforeStoreUpdate();
     }
+
+    protected boolean shouldKillAfterStoreUpdate() {
+      final boolean kill = this.killAfterStoreUpdate;
+      if (this.toggleKillAfterStoreUpdate) {
+        this.killAfterStoreUpdate = !kill;
+        LOG.warn("Toggle KILL after store update to: " + this.killAfterStoreUpdate);
+      }
+      return kill;
+    }
+
+    protected boolean shouldKillAfterStoreUpdate(final boolean isSuspended) {
+      return (isSuspended && !killIfSuspended) ? false : shouldKillAfterStoreUpdate();
+    }
   }
 
   public interface ProcedureExecutorListener {
@@ -503,6 +538,17 @@ public class ProcedureExecutor<TEnvironment> {
           break;
         case WAITING:
           if (!proc.hasChildren()) {
+            // Normally, WAITING procedures should be waken by its children.
+            // But, there is a case that, all the children are successful and before
+            // they can wake up their parent procedure, the master was killed.
+            // So, during recovering the procedures from ProcedureWal, its children
+            // are not loaded because of their SUCCESS state.
+            // So we need to continue to run this WAITING procedure. But before
+            // executing, we need to set its state to RUNNABLE, otherwise, a exception
+            // will throw:
+            // Preconditions.checkArgument(procedure.getState() == ProcedureState.RUNNABLE,
+            // "NOT RUNNABLE! " + procedure.toString());
+            proc.setState(ProcedureState.RUNNABLE);
             runnableList.add(proc);
           }
           break;
@@ -1562,10 +1608,7 @@ public class ProcedureExecutor<TEnvironment> {
       // allows to kill the executor before something is stored to the wal.
       // useful to test the procedure recovery.
       if (testing != null && testing.shouldKillBeforeStoreUpdate(suspended)) {
-        String msg = "TESTING: Kill before store update: " + procedure;
-        LOG.debug(msg);
-        stop();
-        throw new RuntimeException(msg);
+        kill("TESTING: Kill BEFORE store update: " + procedure);
       }
 
       // TODO: The code here doesn't check if store is running before persisting to the store as
@@ -1591,6 +1634,14 @@ public class ProcedureExecutor<TEnvironment> {
 
       assert (reExecute && subprocs == null) || !reExecute;
     } while (reExecute);
+
+    // Allows to kill the executor after something is stored to the WAL but before the below
+    // state settings are done -- in particular the one on the end where we make parent
+    // RUNNABLE again when its children are done; see countDownChildren.
+    if (testing != null && testing.shouldKillAfterStoreUpdate(suspended)) {
+      kill("TESTING: Kill AFTER store update: " + procedure);
+    }
+
     // Submit the new subprocedures
     if (subprocs != null && !procedure.isFailed()) {
       submitChildrenProcedures(subprocs);
@@ -1608,6 +1659,12 @@ public class ProcedureExecutor<TEnvironment> {
     }
   }
 
+  private void kill(String msg) {
+    LOG.debug(msg);
+    stop();
+    throw new RuntimeException(msg);
+  }
+
   private Procedure<TEnvironment>[] initializeChildren(RootProcedureState<TEnvironment> procStack,
       Procedure<TEnvironment> procedure, Procedure<TEnvironment>[] subprocs) {
     assert subprocs != null : "expected subprocedures";

http://git-wip-us.apache.org/repos/asf/hbase/blob/5e06b80e/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java
----------------------------------------------------------------------
diff --git a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java
index e8d72f9..138215b 100644
--- a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java
+++ b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java
@@ -167,6 +167,13 @@ public class ProcedureTestingUtility {
     assertSingleExecutorForKillTests(procExecutor);
   }
 
+  public static <TEnv> void toggleKillAfterStoreUpdate(ProcedureExecutor<TEnv> procExecutor) {
+    createExecutorTesting(procExecutor);
+    procExecutor.testing.killAfterStoreUpdate = !procExecutor.testing.killAfterStoreUpdate;
+    LOG.warn("Set Kill after store update to: " + procExecutor.testing.killAfterStoreUpdate);
+    assertSingleExecutorForKillTests(procExecutor);
+  }
+
   public static <TEnv> void setKillAndToggleBeforeStoreUpdate(ProcedureExecutor<TEnv> procExecutor,
       boolean value) {
     ProcedureTestingUtility.setKillBeforeStoreUpdate(procExecutor, value);

http://git-wip-us.apache.org/repos/asf/hbase/blob/5e06b80e/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java
----------------------------------------------------------------------
diff --git a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java
index cce4caf..4f3c443 100644
--- a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java
+++ b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java
@@ -109,6 +109,29 @@ public class TestChildProcedures {
     ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId);
   }
 
+
+  /**
+   * Test the state setting that happens after store to WAL; in particular the bit where we
+   * set the parent runnable again after its children have all completed successfully.
+   * See HBASE-20978.
+   */
+  @Test
+  public void testChildLoadWithRestartAfterChildSuccess() throws Exception {
+    procEnv.toggleKillAfterStoreUpdate = true;
+
+    TestRootProcedure proc = new TestRootProcedure();
+    long procId = ProcedureTestingUtility.submitAndWait(procExecutor, proc);
+    int restartCount = 0;
+    while (!procExecutor.isFinished(procId)) {
+      ProcedureTestingUtility.restart(procExecutor);
+      ProcedureTestingUtility.waitProcedure(procExecutor, proc);
+      restartCount++;
+    }
+    assertEquals(4, restartCount);
+    assertTrue("expected completed proc", procExecutor.isFinished(procId));
+    ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId);
+  }
+
   @Test
   public void testChildRollbackLoad() throws Exception {
     procEnv.toggleKillBeforeStoreUpdate = false;
@@ -154,6 +177,9 @@ public class TestChildProcedures {
       if (env.toggleKillBeforeStoreUpdate) {
         ProcedureTestingUtility.toggleKillBeforeStoreUpdate(procExecutor);
       }
+      if (env.toggleKillAfterStoreUpdate) {
+        ProcedureTestingUtility.toggleKillAfterStoreUpdate(procExecutor);
+      }
       return new Procedure[] { new TestChildProcedure(), new TestChildProcedure() };
     }
 
@@ -193,6 +219,7 @@ public class TestChildProcedures {
 
   private static class TestProcEnv {
     public boolean toggleKillBeforeStoreUpdate = false;
+    public boolean toggleKillAfterStoreUpdate = false;
     public boolean triggerRollbackOnChild = false;
   }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/5e06b80e/hbase-procedure/src/test/resources/hbase-site.xml
----------------------------------------------------------------------
diff --git a/hbase-procedure/src/test/resources/hbase-site.xml b/hbase-procedure/src/test/resources/hbase-site.xml
index 114ee8a..3709a71 100644
--- a/hbase-procedure/src/test/resources/hbase-site.xml
+++ b/hbase-procedure/src/test/resources/hbase-site.xml
@@ -22,6 +22,10 @@
 -->
 <configuration>
   <property>
+    <name>hbase.defaults.for.version.skip</name>
+    <value>true</value>
+  </property>
+  <property>
     <name>hbase.procedure.store.wal.use.hsync</name>
     <value>false</value>
   </property>