You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by hu...@apache.org on 2021/03/18 18:36:08 UTC

[hbase] branch branch-2.3 updated: HBASE-25639 meta replica state is not respected during active master switch (#3052)

This is an automated email from the ASF dual-hosted git repository.

huaxiangsun pushed a commit to branch branch-2.3
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2.3 by this push:
     new 0a4ddd6  HBASE-25639 meta replica state is not respected during active master switch (#3052)
0a4ddd6 is described below

commit 0a4ddd6c3bcf50ce7f3563e1699145d38cdf9de0
Author: huaxiangsun <hu...@apache.org>
AuthorDate: Thu Mar 18 11:35:46 2021 -0700

    HBASE-25639 meta replica state is not respected during active master switch (#3052)
    
    Signed-off-by: stack <st...@duboce.net>
---
 .../org/apache/hadoop/hbase/master/HMaster.java    |   2 +-
 .../hadoop/hbase/master/MasterMetaBootstrap.java   |  27 +++---
 .../hbase/master/assignment/AssignmentManager.java |  36 +++++---
 .../master/TestMasterFailoverWithMetaReplica.java  | 102 +++++++++++++++++++++
 4 files changed, 138 insertions(+), 29 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index b78f7f3..5f8b63d 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -1018,7 +1018,7 @@ public class HMaster extends HRegionServer implements MasterServices {
     RegionState rs = this.assignmentManager.getRegionStates().
         getRegionState(RegionInfoBuilder.FIRST_META_REGIONINFO);
     LOG.info("hbase:meta {}", rs);
-    if (rs != null && rs.isOffline()) {
+    if ((rs == null) || (rs != null && rs.isOffline())) {
       Optional<InitMetaProcedure> optProc = procedureExecutor.getProcedures().stream()
         .filter(p -> p instanceof InitMetaProcedure).map(o -> (InitMetaProcedure) o).findAny();
       initMetaProc = optProc.orElseGet(() -> {
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java
index da8d228..800ae97 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java
@@ -21,7 +21,6 @@ package org.apache.hadoop.hbase.master;
 import java.io.IOException;
 import java.util.List;
 import org.apache.hadoop.hbase.HConstants;
-import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.client.RegionInfoBuilder;
 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
@@ -61,25 +60,25 @@ class MasterMetaBootstrap {
       throw new IllegalStateException("hbase:meta must be initialized first before we can " +
           "assign out its replicas");
     }
-    ServerName metaServername = MetaTableLocator.getMetaRegionLocation(this.master.getZooKeeper());
+
     for (int i = 1; i < numReplicas; i++) {
       // Get current meta state for replica from zk.
-      RegionState metaState = MetaTableLocator.getMetaRegionState(master.getZooKeeper(), i);
       RegionInfo hri = RegionReplicaUtil.getRegionInfoForReplica(
-          RegionInfoBuilder.FIRST_META_REGIONINFO, i);
-      LOG.debug(hri.getRegionNameAsString() + " replica region state from zookeeper=" + metaState);
-      if (metaServername.equals(metaState.getServerName())) {
-        metaState = null;
-        LOG.info(hri.getRegionNameAsString() +
-          " old location is same as current hbase:meta location; setting location as null...");
-      }
+        RegionInfoBuilder.FIRST_META_REGIONINFO, i);
+
+      RegionState rs = assignmentManager.getRegionStates().getRegionState(hri);
+      LOG.debug(hri.getRegionNameAsString() + " replica region state from zookeeper=" + rs);
+
       // These assigns run inline. All is blocked till they complete. Only interrupt is shutting
       // down hosting server which calls AM#stop.
-      if (metaState != null && metaState.getServerName() != null) {
-        // Try to retain old assignment.
-        assignmentManager.assignAsync(hri, metaState.getServerName());
-      } else {
+      if (rs == null) {
         assignmentManager.assignAsync(hri);
+      } else if (rs != null && rs.isOffline()) {
+        if (rs.getServerName() != null) {
+          assignmentManager.assignAsync(hri, rs.getServerName());
+        } else {
+          assignmentManager.assignAsync(hri);
+        }
       }
     }
     unassignExcessMetaReplica(numReplicas);
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index edc7dee..880de2a 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -45,6 +45,7 @@ import org.apache.hadoop.hbase.client.DoNotRetryRegionException;
 import org.apache.hadoop.hbase.client.MasterSwitchType;
 import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.client.RegionInfoBuilder;
+import org.apache.hadoop.hbase.client.RegionReplicaUtil;
 import org.apache.hadoop.hbase.client.RegionStatesCount;
 import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.client.TableState;
@@ -225,21 +226,28 @@ public class AssignmentManager {
     ZKWatcher zkw = master.getZooKeeper();
     // it could be null in some tests
     if (zkw != null) {
-      // here we are still in the early steps of active master startup. There is only one thread(us)
-      // can access AssignmentManager and create region node, so here we do not need to lock the
-      // region node.
-      RegionState regionState = MetaTableLocator.getMetaRegionState(zkw);
-      RegionStateNode regionNode =
-        regionStates.getOrCreateRegionStateNode(RegionInfoBuilder.FIRST_META_REGIONINFO);
-      regionNode.setRegionLocation(regionState.getServerName());
-      regionNode.setState(regionState.getState());
-      if (regionNode.getProcedure() != null) {
-        regionNode.getProcedure().stateLoaded(this, regionNode);
-      }
-      if (regionState.getServerName() != null) {
-        regionStates.addRegionToServer(regionNode);
+      List<String> metaZNodes = zkw.getMetaReplicaNodes();
+      LOG.debug("hbase:meta replica znodes: {}", metaZNodes);
+      for (String metaZNode : metaZNodes) {
+        int replicaId = zkw.getZNodePaths().getMetaReplicaIdFromZnode(metaZNode);
+        // We are still in the early steps of active master startup. Only one thread (us) can
+        // access AssignmentManager and create region nodes at this point, so we do not need to
+        // lock the region node here.
+        RegionState regionState = MetaTableLocator.getMetaRegionState(zkw, replicaId);
+        RegionStateNode regionNode = regionStates.getOrCreateRegionStateNode(regionState.getRegion());
+        regionNode.setRegionLocation(regionState.getServerName());
+        regionNode.setState(regionState.getState());
+        if (regionNode.getProcedure() != null) {
+          regionNode.getProcedure().stateLoaded(this, regionNode);
+        }
+        if (regionState.getServerName() != null) {
+          regionStates.addRegionToServer(regionNode);
+        }
+        if (RegionReplicaUtil.isDefaultReplica(replicaId)) {
+          setMetaAssigned(regionState.getRegion(), regionState.getState() == State.OPEN);
+        }
+        LOG.debug("Loaded hbase:meta {}", regionNode);
       }
-      setMetaAssigned(regionState.getRegion(), regionState.getState() == State.OPEN);
     }
   }
 
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailoverWithMetaReplica.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailoverWithMetaReplica.java
new file mode 100644
index 0000000..c58dc06
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailoverWithMetaReplica.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import java.util.List;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.MiniHBaseCluster;
+import org.apache.hadoop.hbase.StartMiniClusterOption;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.client.RegionInfoBuilder;
+import org.apache.hadoop.hbase.client.RegionReplicaUtil;
+import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil;
+import org.apache.hadoop.hbase.testclassification.LargeTests;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.util.JVMClusterUtil;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+@Category({MasterTests.class, LargeTests.class})
+public class TestMasterFailoverWithMetaReplica {
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+      HBaseClassTestRule.forClass(TestMasterFailoverWithMetaReplica.class);
+  private static final int num_of_meta_replica = 2;
+
+  /**
+   * Tests that a master failover does not change the state of the meta replica regions. Once the
+   * replicas are assigned, the active master is stopped; the new active master must see all
+   * region servers online and report the same number of procedures as the old master did.
+   */
+  @Test
+  public void testMasterFailoverWithMetaReplica() throws Exception {
+    // Start a mini cluster with two masters and one region server per configured meta replica.
+    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+    TEST_UTIL.getConfiguration().setInt(HConstants.META_REPLICAS_NUM, num_of_meta_replica);
+
+    StartMiniClusterOption option = StartMiniClusterOption.builder()
+        .numMasters(2).numRegionServers(num_of_meta_replica).build();
+    TEST_UTIL.startMiniCluster(option);
+    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+
+    assertTrue(cluster.waitForActiveAndReadyMaster());
+    HMaster oldMaster = cluster.getMaster();
+
+    // Wait until every non-default meta replica (replicaId >= 1) is assigned by the old master.
+    for (int replicaId = 1; replicaId < num_of_meta_replica; replicaId++) {
+      RegionInfo h = RegionReplicaUtil
+        .getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO, replicaId);
+      AssignmentTestingUtil.waitForAssignment(oldMaster.getAssignmentManager(), h);
+    }
+
+    int oldProcedureNum = oldMaster.getProcedures().size();
+
+    int activeIndex = -1;
+    List<JVMClusterUtil.MasterThread> masterThreads = cluster.getMasterThreads();
+
+    for (int i = 0; i < masterThreads.size(); i++) {
+      if (masterThreads.get(i).getMaster().isActiveMaster()) {
+        activeIndex = i;
+      }
+    }
+
+    // Stop the active master and wait for the new master to come online.
+    cluster.stopMaster(activeIndex);
+    cluster.waitOnMaster(activeIndex);
+    assertTrue(cluster.waitForActiveAndReadyMaster());
+    // Double check that this is actually a new master (a different HMaster instance).
+    HMaster newMaster = cluster.getMaster();
+    assertFalse(oldMaster == newMaster);
+
+    int newProcedureNum = newMaster.getProcedures().size();
+
+    // Make sure all region servers report back and there are no new procedures.
+    assertEquals(newMaster.getServerManager().getOnlineServers().size(), num_of_meta_replica);
+    assertEquals(oldProcedureNum, newProcedureNum);
+
+    // Stop the cluster
+    TEST_UTIL.shutdownMiniCluster();
+  }
+}