You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by wc...@apache.org on 2020/06/26 15:52:12 UTC

[hbase] branch branch-2 updated: HBASE-24562: Stabilize master startup with meta replicas enabled (#1903)

This is an automated email from the ASF dual-hosted git repository.

wchevreuil pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2 by this push:
     new 50d1a79  HBASE-24562: Stabilize master startup with meta replicas enabled (#1903)
50d1a79 is described below

commit 50d1a7925579836b5a4a20b5a00d5fce8d11e474
Author: BukrosSzabolcs <bu...@gmail.com>
AuthorDate: Wed Jun 24 19:38:36 2020 +0200

    HBASE-24562: Stabilize master startup with meta replicas enabled (#1903)
    
    Signed-off-by: Wellington Chevreuil <wc...@apache.org>
    Signed-off-by: Huaxiang Sun <hu...@apache.com>
    (cherry picked from commit 8cdb2cca4461d6adad3f44af001055848a205370)
---
 .../org/apache/hadoop/hbase/master/HMaster.java    |  6 +-
 .../hadoop/hbase/master/MasterMetaBootstrap.java   |  4 +-
 .../hbase/master/assignment/AssignmentManager.java | 34 ++++++++-
 .../hadoop/hbase/client/TestMetaWithReplicas.java  | 83 ++++++++++++++++++++--
 4 files changed, 115 insertions(+), 12 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index 7f6b78c..bbede79 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -1140,7 +1140,11 @@ public class HMaster extends HRegionServer implements MasterServices {
     assignmentManager.checkIfShouldMoveSystemRegionAsync();
     status.setStatus("Assign meta replicas");
     MasterMetaBootstrap metaBootstrap = createMetaBootstrap();
-    metaBootstrap.assignMetaReplicas();
+    try {
+      metaBootstrap.assignMetaReplicas();
+    } catch (IOException | KeeperException e){
+      LOG.error("Assigning meta replica failed: ", e);
+    }
     status.setStatus("Starting quota manager");
     initQuotaManager();
     if (QuotaUtil.isQuotaEnabled(conf)) {
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java
index e57817e..1cf6cf1 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java
@@ -81,9 +81,9 @@ class MasterMetaBootstrap {
       // down hosting server which calls AM#stop.
       if (metaState != null && metaState.getServerName() != null) {
         // Try to retain old assignment.
-        assignmentManager.assign(hri, metaState.getServerName());
+        assignmentManager.assignAsync(hri, metaState.getServerName());
       } else {
-        assignmentManager.assign(hri);
+        assignmentManager.assignAsync(hri);
       }
     }
     unassignExcessMetaReplica(numReplicas);
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index 974202c..04529f0 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -589,9 +589,9 @@ public class AssignmentManager {
     }
   }
 
-  // TODO: Need an async version of this for hbck2.
-  public long assign(RegionInfo regionInfo, ServerName sn) throws IOException {
-    // TODO: should we use getRegionStateNode?
+  private TransitRegionStateProcedure createAssignProcedure(RegionInfo regionInfo, ServerName sn)
+    throws IOException {
+     // TODO: should we use getRegionStateNode?
     RegionStateNode regionNode = regionStates.getOrCreateRegionStateNode(regionInfo);
     TransitRegionStateProcedure proc;
     regionNode.lock();
@@ -602,6 +602,12 @@ public class AssignmentManager {
     } finally {
       regionNode.unlock();
     }
+    return proc;
+  }
+
+  // TODO: Need an async version of this for hbck2.
+  public long assign(RegionInfo regionInfo, ServerName sn) throws IOException {
+    TransitRegionStateProcedure proc = createAssignProcedure(regionInfo, sn);
     ProcedureSyncWait.submitAndWaitProcedure(master.getMasterProcedureExecutor(), proc);
     return proc.getProcId();
   }
@@ -610,6 +616,28 @@ public class AssignmentManager {
     return assign(regionInfo, null);
   }
 
+  /**
+   * Submits a procedure that assigns a region to a target server without waiting for it to finish
+   * @param regionInfo the region we would like to assign
+   * @param sn target server name
+   * @return a future for the submitted assign procedure
+   * @throws IOException if the assign procedure cannot be created
+   */
+  public Future<byte[]> assignAsync(RegionInfo regionInfo, ServerName sn) throws IOException {
+    TransitRegionStateProcedure proc = createAssignProcedure(regionInfo, sn);
+    return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc);
+  }
+
+  /**
+   * Submits a procedure that assigns a region without waiting for it to finish
+   * @param regionInfo the region we would like to assign
+   * @return a future for the submitted assign procedure
+   * @throws IOException if the assign procedure cannot be created
+   */
+  public Future<byte[]> assignAsync(RegionInfo regionInfo) throws IOException {
+    return assignAsync(regionInfo, null);
+  }
+
   public long unassign(RegionInfo regionInfo) throws IOException {
     RegionStateNode regionNode = regionStates.getRegionStateNode(regionInfo);
     if (regionNode == null) {
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java
index 5db646f..a5a98a6 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java
@@ -23,12 +23,14 @@ import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
+import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.EnumSet;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
+import java.util.concurrent.Future;
 import java.util.concurrent.atomic.AtomicBoolean;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.Abortable;
@@ -42,9 +44,12 @@ import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.StartMiniClusterOption;
 import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.TableNotFoundException;
+import org.apache.hadoop.hbase.master.HMaster;
+import org.apache.hadoop.hbase.master.MasterServices;
 import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
 import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil;
-import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
+import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
+import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
 import org.apache.hadoop.hbase.regionserver.StorefileRefresherChore;
 import org.apache.hadoop.hbase.testclassification.LargeTests;
 import org.apache.hadoop.hbase.util.Bytes;
@@ -54,6 +59,7 @@ import org.apache.hadoop.hbase.zookeeper.ZKUtil;
 import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
 import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
 import org.junit.After;
+import org.junit.Assert;
 import org.junit.Before;
 import org.junit.ClassRule;
 import org.junit.Rule;
@@ -63,6 +69,8 @@ import org.junit.rules.TestName;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
+
 /**
  * Tests the scenarios where replicas are enabled for the meta table
  */
@@ -163,7 +171,7 @@ public class TestMetaWithReplicas {
         conf.get("zookeeper.znode.metaserver", "meta-region-server"));
     // check that the data in the znode is parseable (this would also mean the znode exists)
     byte[] data = ZKUtil.getData(zkw, primaryMetaZnode);
-    ProtobufUtil.toServerName(data);
+    ProtobufUtil.parseServerNameFrom(data);
     for (int i = 1; i < 3; i++) {
       String secZnode = ZNodePaths.joinZNode(baseZNode,
           conf.get("zookeeper.znode.metaserver", "meta-region-server") + "-" + i);
@@ -171,7 +179,7 @@ public class TestMetaWithReplicas {
       assertTrue(str.equals(secZnode));
       // check that the data in the znode is parseable (this would also mean the znode exists)
       data = ZKUtil.getData(zkw, secZnode);
-      ProtobufUtil.toServerName(data);
+      ProtobufUtil.parseServerNameFrom(data);
     }
   }
 
@@ -198,7 +206,7 @@ public class TestMetaWithReplicas {
     String primaryMetaZnode = ZNodePaths.joinZNode(baseZNode,
         conf.get("zookeeper.znode.metaserver", "meta-region-server"));
     byte[] data = ZKUtil.getData(zkw, primaryMetaZnode);
-    ServerName primary = ProtobufUtil.toServerName(data);
+    ServerName primary = ProtobufUtil.parseServerNameFrom(data);
     LOG.info("Primary=" + primary.toString());
 
     TableName TABLE = TableName.valueOf("testShutdownHandling");
@@ -304,7 +312,7 @@ public class TestMetaWithReplicas {
         conf.get("zookeeper.znode.metaserver", "meta-region-server"));
     // check that the data in the znode is parseable (this would also mean the znode exists)
     byte[] data = ZKUtil.getData(zkw, primaryMetaZnode);
-    ServerName currentServer = ProtobufUtil.toServerName(data);
+    ServerName currentServer = ProtobufUtil.parseServerNameFrom(data);
     Collection<ServerName> liveServers = TEST_UTIL.getAdmin()
         .getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS)).getLiveServerMetrics().keySet();
     ServerName moveToServer = null;
@@ -326,7 +334,7 @@ public class TestMetaWithReplicas {
     do {
       Thread.sleep(10);
       data = ZKUtil.getData(zkw, primaryMetaZnode);
-      currentServer = ProtobufUtil.toServerName(data);
+      currentServer = ProtobufUtil.parseServerNameFrom(data);
       i++;
     } while (!moveToServer.equals(currentServer) && i < max); //wait for 10 seconds overall
     assertNotEquals(max, i);
@@ -353,4 +361,67 @@ public class TestMetaWithReplicas {
       assertNotEquals(3, i);
     }
   }
+
+  @Test
+  public void testFailedReplicaAssigment() throws InterruptedException, IOException {
+    //using our rigged master, to force a failed meta replica assignment
+    TEST_UTIL.getMiniHBaseCluster().getConfiguration().setClass(HConstants.MASTER_IMPL, BrokenMetaReplicaMaster.class, HMaster.class);
+    TEST_UTIL.getMiniHBaseCluster().stopMaster(0).join();
+    HMaster newMaster = TEST_UTIL.getMiniHBaseCluster().startMaster().getMaster();
+    //waiting for master to come up
+    TEST_UTIL.waitFor(30000, () -> newMaster.isInitialized());
+    TEST_UTIL.getMiniHBaseCluster().getConfiguration().unset(HConstants.MASTER_IMPL);
+
+
+    AssignmentManager am = newMaster.getAssignmentManager();
+    //showing one of the replicas got assigned
+    RegionInfo metaReplicaHri = RegionReplicaUtil.getRegionInfoForReplica(
+      RegionInfoBuilder.FIRST_META_REGIONINFO, 1);
+    RegionStateNode metaReplicaRegionNode = am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri);
+    Assert.assertNotNull(metaReplicaRegionNode.getRegionLocation());
+    //showing one of the replicas failed to be assigned
+    RegionInfo metaReplicaHri2 = RegionReplicaUtil.getRegionInfoForReplica(
+      RegionInfoBuilder.FIRST_META_REGIONINFO, 2);
+    RegionStateNode metaReplicaRegionNode2 = am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri2);
+    Assert.assertNull(metaReplicaRegionNode2.getRegionLocation());
+
+    //showing master is active and running
+    Assert.assertFalse(newMaster.isStopping());
+    Assert.assertFalse(newMaster.isStopped());
+    Assert.assertTrue(newMaster.isActiveMaster());
+  }
+
+  public static class BrokenTransitRegionStateProcedure extends TransitRegionStateProcedure {
+    protected BrokenTransitRegionStateProcedure() {
+      // Argument order of the super constructor: (env, hri, assignCandidate, forceNewPlan, type).
+      super(null, null, null, false,TransitionType.ASSIGN);
+    }
+  }
+
+  public static class BrokenMetaReplicaMaster extends HMaster{
+    public BrokenMetaReplicaMaster(final Configuration conf) throws IOException {
+      super(conf);
+    }
+
+    @Override
+    public AssignmentManager createAssignmentManager(MasterServices master) {
+      return new BrokenMasterMetaAssignmentManager(master);
+    }
+  }
+
+  public static class BrokenMasterMetaAssignmentManager extends AssignmentManager{
+    MasterServices master;
+    public BrokenMasterMetaAssignmentManager(final MasterServices master) {
+      super(master);
+      this.master = master;
+    }
+
+    public Future<byte[]> assignAsync(RegionInfo regionInfo, ServerName sn) throws IOException {
+      RegionStateNode regionNode = getRegionStates().getOrCreateRegionStateNode(regionInfo);
+      if (regionNode.getRegionInfo().getReplicaId() == 2) {
+        regionNode.setProcedure(new BrokenTransitRegionStateProcedure());
+      }
+      return super.assignAsync(regionInfo, sn);
+    }
+  }
 }