You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by zh...@apache.org on 2021/04/16 05:41:00 UTC

[hbase] branch branch-2 updated: HBASE-25775 Use a special balancer to deal with maintenance mode (#3161)

This is an automated email from the ASF dual-hosted git repository.

zhangduo pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2 by this push:
     new a177fb4  HBASE-25775 Use a special balancer to deal with maintenance mode (#3161)
a177fb4 is described below

commit a177fb4c62e3af40caa1046c52b1390c12022cc7
Author: Duo Zhang <zh...@apache.org>
AuthorDate: Fri Apr 16 09:50:24 2021 +0800

    HBASE-25775 Use a special balancer to deal with maintenance mode (#3161)
    
    Signed-off-by: Wellington Chevreuil <wc...@apache.org>
---
 .../org/apache/hadoop/hbase/master/HMaster.java    |  11 +-
 .../hbase/master/balancer/BaseLoadBalancer.java    |  51 ++++----
 .../master/balancer/MaintenanceLoadBalancer.java   | 131 +++++++++++++++++++++
 .../hadoop/hbase/master/TestMasterRepairMode.java  |  47 ++++----
 4 files changed, 192 insertions(+), 48 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index 1089adf..62d77b2 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -107,6 +107,7 @@ import org.apache.hadoop.hbase.master.balancer.BalancerChore;
 import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer;
 import org.apache.hadoop.hbase.master.balancer.ClusterStatusChore;
 import org.apache.hadoop.hbase.master.balancer.LoadBalancerFactory;
+import org.apache.hadoop.hbase.master.balancer.MaintenanceLoadBalancer;
 import org.apache.hadoop.hbase.master.cleaner.DirScanPool;
 import org.apache.hadoop.hbase.master.cleaner.HFileCleaner;
 import org.apache.hadoop.hbase.master.cleaner.LogCleaner;
@@ -670,9 +671,13 @@ public class HMaster extends HRegionServer implements MasterServices {
    * Initialize all ZK based system trackers. But do not include {@link RegionServerTracker}, it
    * should have already been initialized along with {@link ServerManager}.
    */
-  @InterfaceAudience.Private
-  protected void initializeZKBasedSystemTrackers()
-      throws IOException, InterruptedException, KeeperException, ReplicationException {
+  private void initializeZKBasedSystemTrackers()
+    throws IOException, KeeperException, ReplicationException {
+    if (maintenanceMode) {
+      // in maintenance mode, always use MaintenanceLoadBalancer.
+      conf.setClass(HConstants.HBASE_MASTER_LOADBALANCER_CLASS, MaintenanceLoadBalancer.class,
+        LoadBalancer.class);
+    }
     this.balancer = LoadBalancerFactory.getLoadBalancer(conf);
     this.loadBalancerTracker = new LoadBalancerTracker(zooKeeper, this);
     this.loadBalancerTracker.start();
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java
index f10f455..d925a79 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java
@@ -35,6 +35,7 @@ import java.util.NavigableMap;
 import java.util.Random;
 import java.util.Set;
 import java.util.TreeMap;
+import java.util.concurrent.ThreadLocalRandom;
 import java.util.function.Predicate;
 import java.util.stream.Collectors;
 import org.apache.commons.lang3.NotImplementedException;
@@ -1027,7 +1028,6 @@ public abstract class BaseLoadBalancer implements LoadBalancer {
   protected float overallSlop;
   protected Configuration config = HBaseConfiguration.create();
   protected RackManager rackManager;
-  private static final Random RANDOM = new Random(System.currentTimeMillis());
   private static final Logger LOG = LoggerFactory.getLogger(BaseLoadBalancer.class);
   protected MetricsBalancer metricsBalancer = null;
   protected ClusterMetrics clusterStatus = null;
@@ -1041,17 +1041,21 @@ public abstract class BaseLoadBalancer implements LoadBalancer {
   @Deprecated
   protected boolean onlySystemTablesOnMaster;
 
-  protected boolean maintenanceMode;
-
   @Override
   public void setConf(Configuration conf) {
     this.config = conf;
     setSlop(conf);
-    if (slop < 0) slop = 0;
-    else if (slop > 1) slop = 1;
+    if (slop < 0) {
+      slop = 0;
+    } else if (slop > 1) {
+      slop = 1;
+    }
 
-    if (overallSlop < 0) overallSlop = 0;
-    else if (overallSlop > 1) overallSlop = 1;
+    if (overallSlop < 0) {
+      overallSlop = 0;
+    } else if (overallSlop > 1) {
+      overallSlop = 1;
+    }
 
     this.onlySystemTablesOnMaster = LoadBalancer.isSystemTablesOnlyOnMaster(this.config);
 
@@ -1061,8 +1065,7 @@ public abstract class BaseLoadBalancer implements LoadBalancer {
     }
     this.isByTable = conf.getBoolean(HConstants.HBASE_MASTER_LOADBALANCE_BYTABLE, isByTable);
     // Print out base configs. Don't print overallSlop since it for simple balancer exclusively.
-    LOG.info("slop={}, systemTablesOnMaster={}",
-        this.slop, this.onlySystemTablesOnMaster);
+    LOG.info("slop={}, systemTablesOnMaster={}", this.slop, this.onlySystemTablesOnMaster);
   }
 
   protected void setSlop(Configuration conf) {
@@ -1079,8 +1082,7 @@ public abstract class BaseLoadBalancer implements LoadBalancer {
    */
   @Deprecated
   public boolean shouldBeOnMaster(RegionInfo region) {
-    return (this.maintenanceMode || this.onlySystemTablesOnMaster)
-        && region.getTable().isSystemTable();
+    return this.onlySystemTablesOnMaster && region.getTable().isSystemTable();
   }
 
   /**
@@ -1147,7 +1149,7 @@ public abstract class BaseLoadBalancer implements LoadBalancer {
   protected Map<ServerName, List<RegionInfo>> assignMasterSystemRegions(
       Collection<RegionInfo> regions, List<ServerName> servers) {
     Map<ServerName, List<RegionInfo>> assignments = new TreeMap<>();
-    if (this.maintenanceMode || this.onlySystemTablesOnMaster) {
+    if (this.onlySystemTablesOnMaster) {
       if (masterServerName != null && servers.contains(masterServerName)) {
         assignments.put(masterServerName, new ArrayList<>());
         for (RegionInfo region : regions) {
@@ -1181,9 +1183,6 @@ public abstract class BaseLoadBalancer implements LoadBalancer {
     if (useRegionFinder) {
       this.regionFinder.setServices(masterServices);
     }
-    if (this.services.isInMaintenanceMode()) {
-      this.maintenanceMode = true;
-    }
   }
 
   @Override
@@ -1298,7 +1297,7 @@ public abstract class BaseLoadBalancer implements LoadBalancer {
     /**
      * only need assign system table
      */
-    if (this.maintenanceMode || regions.isEmpty()) {
+    if (regions.isEmpty()) {
       return assignments;
     }
 
@@ -1438,7 +1437,7 @@ public abstract class BaseLoadBalancer implements LoadBalancer {
       regions = regions.entrySet().stream().filter(e -> !masterRegions.contains(e.getKey()))
           .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
     }
-    if (this.maintenanceMode || regions.isEmpty()) {
+    if (regions.isEmpty()) {
       return assignments;
     }
 
@@ -1585,8 +1584,9 @@ public abstract class BaseLoadBalancer implements LoadBalancer {
     final int maxIterations = numServers * 4;
     int iterations = 0;
     List<ServerName> usedSNs = new ArrayList<>(servers.size());
+    Random rand = ThreadLocalRandom.current();
     do {
-      int i = RANDOM.nextInt(numServers);
+      int i = rand.nextInt(numServers);
       sn = servers.get(i);
       if (!usedSNs.contains(sn)) {
         usedSNs.add(sn);
@@ -1616,13 +1616,14 @@ public abstract class BaseLoadBalancer implements LoadBalancer {
    */
   private void roundRobinAssignment(Cluster cluster, List<RegionInfo> regions,
     List<ServerName> servers, Map<ServerName, List<RegionInfo>> assignments) {
+    Random rand = ThreadLocalRandom.current();
     List<RegionInfo> unassignedRegions = new ArrayList<>();
     int numServers = servers.size();
     int numRegions = regions.size();
     int max = (int) Math.ceil((float) numRegions / numServers);
     int serverIdx = 0;
     if (numServers > 1) {
-      serverIdx = RANDOM.nextInt(numServers);
+      serverIdx = rand.nextInt(numServers);
     }
     int regionIdx = 0;
     for (int j = 0; j < numServers; j++) {
@@ -1644,17 +1645,17 @@ public abstract class BaseLoadBalancer implements LoadBalancer {
 
     List<RegionInfo> lastFewRegions = new ArrayList<>();
     // assign the remaining by going through the list and try to assign to servers one-by-one
-    serverIdx = RANDOM.nextInt(numServers);
-    OUTER : for (RegionInfo region : unassignedRegions) {
+    serverIdx = rand.nextInt(numServers);
+    for (RegionInfo region : unassignedRegions) {
       boolean assigned = false;
-      INNER : for (int j = 0; j < numServers; j++) { // try all servers one by one
+      for (int j = 0; j < numServers; j++) { // try all servers one by one
         ServerName server = servers.get((j + serverIdx) % numServers);
         if (cluster.wouldLowerAvailability(region, server)) {
-          continue INNER;
+          continue;
         } else {
           assignments.computeIfAbsent(server, k -> new ArrayList<>()).add(region);
           cluster.doAssignRegion(region, server);
-          serverIdx = (j + serverIdx + 1) % numServers; //remain from next server
+          serverIdx = (j + serverIdx + 1) % numServers; // remain from next server
           assigned = true;
           break;
         }
@@ -1666,7 +1667,7 @@ public abstract class BaseLoadBalancer implements LoadBalancer {
     // just sprinkle the rest of the regions on random regionservers. The balanceCluster will
     // make it optimal later. we can end up with this if numReplicas > numServers.
     for (RegionInfo region : lastFewRegions) {
-      int i = RANDOM.nextInt(numServers);
+      int i = rand.nextInt(numServers);
       ServerName server = servers.get(i);
       assignments.computeIfAbsent(server, k -> new ArrayList<>()).add(region);
       cluster.doAssignRegion(region, server);
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/MaintenanceLoadBalancer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/MaintenanceLoadBalancer.java
new file mode 100644
index 0000000..5c25272
--- /dev/null
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/MaintenanceLoadBalancer.java
@@ -0,0 +1,131 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master.balancer;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.hbase.ClusterMetrics;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.master.LoadBalancer;
+import org.apache.hadoop.hbase.master.MasterServices;
+import org.apache.hadoop.hbase.master.RegionPlan;
+import org.apache.yetus.audience.InterfaceAudience;
+
+/**
+ * a balancer which is only used in maintenance mode.
+ */
+@InterfaceAudience.Private
+public class MaintenanceLoadBalancer extends Configured implements LoadBalancer {
+
+  private volatile boolean stopped = false;
+
+  @Override
+  public void stop(String why) {
+    stopped = true;
+  }
+
+  @Override
+  public boolean isStopped() {
+    return stopped;
+  }
+
+  @Override
+  public void setClusterMetrics(ClusterMetrics st) {
+  }
+
+  @Override
+  public void setMasterServices(MasterServices masterServices) {
+  }
+
+  @Override
+  public List<RegionPlan> balanceCluster(
+    Map<TableName, Map<ServerName, List<RegionInfo>>> loadOfAllTable) throws IOException {
+    // do not need to balance in maintenance mode
+    return Collections.emptyList();
+  }
+
+  @Override
+  public List<RegionPlan> balanceTable(TableName tableName,
+    Map<ServerName, List<RegionInfo>> loadOfOneTable) {
+    return Collections.emptyList();
+  }
+
+  private Map<ServerName, List<RegionInfo>> assign(Collection<RegionInfo> regions,
+    List<ServerName> servers) {
+    // should only have 1 region server in maintenance mode
+    assert servers.size() == 1;
+    List<RegionInfo> systemRegions =
+      regions.stream().filter(r -> r.getTable().isSystemTable()).collect(Collectors.toList());
+    if (!systemRegions.isEmpty()) {
+      return Collections.singletonMap(servers.get(0), systemRegions);
+    } else {
+      return Collections.emptyMap();
+    }
+  }
+
+  @Override
+  public Map<ServerName, List<RegionInfo>> roundRobinAssignment(List<RegionInfo> regions,
+    List<ServerName> servers) {
+    return assign(regions, servers);
+  }
+
+  @Override
+  public Map<ServerName, List<RegionInfo>> retainAssignment(Map<RegionInfo, ServerName> regions,
+    List<ServerName> servers)  {
+    return assign(regions.keySet(), servers);
+  }
+
+  @Override
+  public ServerName randomAssignment(RegionInfo regionInfo, List<ServerName> servers) {
+    // should only have 1 region server in maintenance mode
+    assert servers.size() == 1;
+    return regionInfo.getTable().isSystemTable() ? servers.get(0) : null;
+  }
+
+  @Override
+  public void initialize() {
+  }
+
+  @Override
+  public void regionOnline(RegionInfo regionInfo, ServerName sn) {
+  }
+
+  @Override
+  public void regionOffline(RegionInfo regionInfo) {
+  }
+
+  @Override
+  public void onConfigurationChange(Configuration conf) {
+  }
+
+  @Override
+  public void postMasterStartupInitialize() {
+  }
+
+  @Override
+  public void updateBalancerStatus(boolean status) {
+  }
+}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRepairMode.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRepairMode.java
index 6e85448..f506816 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRepairMode.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRepairMode.java
@@ -18,9 +18,11 @@
 package org.apache.hadoop.hbase.master;
 
 import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertThrows;
 import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
+
 import java.util.Arrays;
+import java.util.concurrent.TimeUnit;
 import java.util.stream.Stream;
 import java.util.stream.StreamSupport;
 import org.apache.hadoop.conf.Configuration;
@@ -29,7 +31,10 @@ import org.apache.hadoop.hbase.HBaseTestingUtility;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.StartMiniClusterOption;
 import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.AsyncConnection;
+import org.apache.hadoop.hbase.client.AsyncTable;
 import org.apache.hadoop.hbase.client.Connection;
+import org.apache.hadoop.hbase.client.ConnectionFactory;
 import org.apache.hadoop.hbase.client.Put;
 import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.client.ResultScanner;
@@ -48,12 +53,12 @@ import org.junit.rules.TestName;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-@Category({MasterTests.class, LargeTests.class})
+@Category({ MasterTests.class, LargeTests.class })
 public class TestMasterRepairMode {
 
   @ClassRule
   public static final HBaseClassTestRule CLASS_RULE =
-      HBaseClassTestRule.forClass(TestMasterRepairMode.class);
+    HBaseClassTestRule.forClass(TestMasterRepairMode.class);
 
   @Rule
   public TestName name = new TestName();
@@ -84,16 +89,14 @@ public class TestMasterRepairMode {
   public void testNewCluster() throws Exception {
     enableMaintenanceMode();
 
-    TEST_UTIL.startMiniCluster(StartMiniClusterOption.builder()
-        .numRegionServers(0)
-        .numDataNodes(3)
-        .build());
+    TEST_UTIL.startMiniCluster(
+      StartMiniClusterOption.builder().numRegionServers(0).numDataNodes(3).build());
 
     Connection conn = TEST_UTIL.getConnection();
     assertTrue(conn.getAdmin().isMasterInMaintenanceMode());
 
     try (Table table = conn.getTable(TableName.META_TABLE_NAME);
-        ResultScanner scanner = table.getScanner(new Scan())) {
+      ResultScanner scanner = table.getScanner(new Scan())) {
       assertNotNull("Could not read meta.", scanner.next());
     }
   }
@@ -113,25 +116,29 @@ public class TestMasterRepairMode {
     LOG.info("Starting master-only");
 
     enableMaintenanceMode();
-    TEST_UTIL.startMiniHBaseCluster(StartMiniClusterOption.builder()
-        .numRegionServers(0).createRootDir(false).build());
+    TEST_UTIL.startMiniHBaseCluster(
+      StartMiniClusterOption.builder().numRegionServers(0).createRootDir(false).build());
 
     Connection conn = TEST_UTIL.getConnection();
     assertTrue(conn.getAdmin().isMasterInMaintenanceMode());
 
     try (Table table = conn.getTable(TableName.META_TABLE_NAME);
-        ResultScanner scanner = table.getScanner(HConstants.TABLE_FAMILY);
-        Stream<Result> results = StreamSupport.stream(scanner.spliterator(), false)) {
+      ResultScanner scanner = table.getScanner(HConstants.TABLE_FAMILY);
+      Stream<Result> results = StreamSupport.stream(scanner.spliterator(), false)) {
       assertTrue("Did not find user table records while reading hbase:meta",
-          results.anyMatch(r -> Arrays.equals(r.getRow(), testRepairMode.getName())));
+        results.anyMatch(r -> Arrays.equals(r.getRow(), testRepairMode.getName())));
     }
-
-    try (Table table = conn.getTable(testRepairMode);
-        ResultScanner scanner = table.getScanner(new Scan())) {
-      scanner.next();
-      fail("Should not be able to access user-space tables in repair mode.");
-    } catch (Exception e) {
-      // Expected
+    try (AsyncConnection asyncConn =
+      ConnectionFactory.createAsyncConnection(TEST_UTIL.getConfiguration()).get()) {
+      // use async table so we can set the timeout and retry value to let the operation fail fast
+      AsyncTable<?> table = asyncConn.getTableBuilder(testRepairMode)
+        .setScanTimeout(5, TimeUnit.SECONDS).setMaxRetries(2).build();
+      assertThrows("Should not be able to access user-space tables in repair mode.",
+        Exception.class, () -> {
+          try (ResultScanner scanner = table.getScanner(new Scan())) {
+            scanner.next();
+          }
+        });
     }
   }
 }