You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by zh...@apache.org on 2018/11/19 01:47:04 UTC

hbase git commit: HBASE-21494 NPE when loading RecoverStandByProcedure

Repository: hbase
Updated Branches:
  refs/heads/master f555258e7 -> b329e6e3f


HBASE-21494 NPE when loading RecoverStandByProcedure


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/b329e6e3
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/b329e6e3
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/b329e6e3

Branch: refs/heads/master
Commit: b329e6e3f271bc22ec4a6f4dd71a8e8b422db3d0
Parents: f555258
Author: zhangduo <zh...@apache.org>
Authored: Sun Nov 18 15:58:53 2018 +0800
Committer: Duo Zhang <zh...@apache.org>
Committed: Mon Nov 19 09:35:18 2018 +0800

----------------------------------------------------------------------
 .../org/apache/hadoop/hbase/master/HMaster.java |   4 +-
 .../SyncReplicationReplayWALManager.java        |   4 +-
 .../TestRegisterPeerWorkerWhenRestarting.java   | 127 +++++++++++++++++++
 ...tPeerSyncReplicationStateProcedureRetry.java |   5 +
 4 files changed, 137 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/b329e6e3/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index 31dc208..e1d3740 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -778,7 +778,6 @@ public class HMaster extends HRegionServer implements MasterServices {
     this.splitOrMergeTracker.start();
 
     this.replicationPeerManager = ReplicationPeerManager.create(zooKeeper, conf);
-    this.syncReplicationReplayWALManager = new SyncReplicationReplayWALManager(this);
 
     this.drainingServerTracker = new DrainingServerTracker(zooKeeper, this, this.serverManager);
     this.drainingServerTracker.start();
@@ -949,7 +948,10 @@ public class HMaster extends HRegionServer implements MasterServices {
     }
 
     status.setStatus("Initialize ServerManager and schedule SCP for crash servers");
+    // The below two managers must be created before loading procedures, as they will be used during
+    // loading.
     this.serverManager = createServerManager(this);
+    this.syncReplicationReplayWALManager = new SyncReplicationReplayWALManager(this);
     createProcedureExecutor();
     @SuppressWarnings("rawtypes")
     Map<Class<? extends Procedure>, List<Procedure<MasterProcedureEnv>>> procsByType =

http://git-wip-us.apache.org/repos/asf/hbase/blob/b329e6e3/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/SyncReplicationReplayWALManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/SyncReplicationReplayWALManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/SyncReplicationReplayWALManager.java
index 89e97bb..ae624b1 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/SyncReplicationReplayWALManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/SyncReplicationReplayWALManager.java
@@ -146,12 +146,12 @@ public class SyncReplicationReplayWALManager {
     this.fs = services.getMasterFileSystem().getWALFileSystem();
     this.walRootDir = services.getMasterFileSystem().getWALRootDir();
     this.remoteWALDir = new Path(this.walRootDir, ReplicationUtils.REMOTE_WAL_DIR_NAME);
-    MasterProcedureScheduler scheduler =
-      services.getMasterProcedureExecutor().getEnvironment().getProcedureScheduler();
     serverManager.registerListener(new ServerListener() {
 
       @Override
       public void serverAdded(ServerName serverName) {
+        MasterProcedureScheduler scheduler =
+          services.getMasterProcedureExecutor().getEnvironment().getProcedureScheduler();
         for (UsedReplayWorkersForPeer usedWorkers : usedWorkersByPeer.values()) {
           synchronized (usedWorkers) {
             usedWorkers.wake(scheduler);

http://git-wip-us.apache.org/repos/asf/hbase/blob/b329e6e3/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestRegisterPeerWorkerWhenRestarting.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestRegisterPeerWorkerWhenRestarting.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestRegisterPeerWorkerWhenRestarting.java
new file mode 100644
index 0000000..72aa32d
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestRegisterPeerWorkerWhenRestarting.java
@@ -0,0 +1,127 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master.replication;
+
+import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RecoverStandbyState.DISPATCH_WALS_VALUE;
+import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RecoverStandbyState.UNREGISTER_PEER_FROM_WORKER_STORAGE_VALUE;
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.master.HMaster;
+import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
+import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
+import org.apache.hadoop.hbase.replication.SyncReplicationState;
+import org.apache.hadoop.hbase.replication.SyncReplicationTestBase;
+import org.apache.hadoop.hbase.testclassification.LargeTests;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
+import org.apache.zookeeper.KeeperException;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * Testcase for HBASE-21494.
+ */
+@Category({ MasterTests.class, LargeTests.class })
+public class TestRegisterPeerWorkerWhenRestarting extends SyncReplicationTestBase {
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+    HBaseClassTestRule.forClass(TestRegisterPeerWorkerWhenRestarting.class);
+
+  private static volatile boolean FAIL = false;
+
+  public static final class HMasterForTest extends HMaster {
+
+    public HMasterForTest(Configuration conf) throws IOException, KeeperException {
+      super(conf);
+    }
+
+    @Override
+    public void remoteProcedureCompleted(long procId) {
+      if (FAIL && getMasterProcedureExecutor()
+        .getProcedure(procId) instanceof SyncReplicationReplayWALRemoteProcedure) {
+        throw new RuntimeException("Inject error");
+      }
+      super.remoteProcedureCompleted(procId);
+    }
+  }
+
+  @BeforeClass
+  public static void setUp() throws Exception {
+    UTIL2.getConfiguration().setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class);
+    SyncReplicationTestBase.setUp();
+  }
+
+  @Test
+  public void testRestart() throws Exception {
+    UTIL2.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID,
+      SyncReplicationState.STANDBY);
+    UTIL1.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID,
+      SyncReplicationState.ACTIVE);
+
+    UTIL1.getAdmin().disableReplicationPeer(PEER_ID);
+    write(UTIL1, 0, 100);
+    Thread.sleep(2000);
+    // peer is disabled so no data have been replicated
+    verifyNotReplicatedThroughRegion(UTIL2, 0, 100);
+
+    // transit the A to DA first to avoid too many error logs.
+    UTIL1.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID,
+      SyncReplicationState.DOWNGRADE_ACTIVE);
+    HMaster master = UTIL2.getHBaseCluster().getMaster();
+    // make sure the transiting can not succeed
+    FAIL = true;
+    ProcedureExecutor<MasterProcedureEnv> procExec = master.getMasterProcedureExecutor();
+    Thread t = new Thread() {
+
+      @Override
+      public void run() {
+        try {
+          UTIL2.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID,
+            SyncReplicationState.DOWNGRADE_ACTIVE);
+        } catch (IOException e) {
+          throw new UncheckedIOException(e);
+        }
+      }
+    };
+    t.start();
+    // wait until we are in the states where we need to register peer worker when restarting
+    UTIL2.waitFor(60000,
+      () -> procExec.getProcedures().stream().filter(p -> p instanceof RecoverStandbyProcedure)
+        .map(p -> (RecoverStandbyProcedure) p)
+        .anyMatch(p -> p.getCurrentStateId() == DISPATCH_WALS_VALUE ||
+          p.getCurrentStateId() == UNREGISTER_PEER_FROM_WORKER_STORAGE_VALUE));
+    // failover to another master
+    MasterThread mt = UTIL2.getMiniHBaseCluster().getMasterThread();
+    mt.getMaster().abort("for testing");
+    mt.join();
+    FAIL = false;
+    t.join();
+    // make sure the new master can finish the transiting
+    assertEquals(SyncReplicationState.DOWNGRADE_ACTIVE,
+      UTIL2.getAdmin().getReplicationPeerSyncReplicationState(PEER_ID));
+    verify(UTIL2, 0, 100);
+  }
+}

http://git-wip-us.apache.org/repos/asf/hbase/blob/b329e6e3/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestTransitPeerSyncReplicationStateProcedureRetry.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestTransitPeerSyncReplicationStateProcedureRetry.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestTransitPeerSyncReplicationStateProcedureRetry.java
index 1c4a819..9b73039 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestTransitPeerSyncReplicationStateProcedureRetry.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestTransitPeerSyncReplicationStateProcedureRetry.java
@@ -17,6 +17,8 @@
  */
 package org.apache.hadoop.hbase.master.replication;
 
+import static org.junit.Assert.assertEquals;
+
 import java.io.IOException;
 import java.io.UncheckedIOException;
 import org.apache.hadoop.hbase.HBaseClassTestRule;
@@ -90,5 +92,8 @@ public class TestTransitPeerSyncReplicationStateProcedureRetry extends SyncRepli
       .mapToLong(Procedure::getProcId).min().getAsLong();
     MasterProcedureTestingUtility.testRecoveryAndDoubleExecution(procExec, procId);
     ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, false);
+    assertEquals(SyncReplicationState.DOWNGRADE_ACTIVE,
+      UTIL2.getAdmin().getReplicationPeerSyncReplicationState(PEER_ID));
+    verify(UTIL2, 0, 100);
   }
 }