You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by zg...@apache.org on 2019/07/18 00:32:16 UTC

[hbase] 02/02: HBASE-22527 [hbck2] Add a master web ui to show the problematic regions

This is an automated email from the ASF dual-hosted git repository.

zghao pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git

commit 9e59b7ff5ec426d011f8f0b0ac23b3a997c89bf4
Author: Guanghao Zhang <zg...@apache.org>
AuthorDate: Thu Jul 11 15:20:34 2019 +0800

    HBASE-22527 [hbck2] Add a master web ui to show the problematic regions
---
 .../tmpl/master/AssignmentManagerStatusTmpl.jamon  | 100 ++++++++++++++--
 .../hbase/master/assignment/AssignmentManager.java |  58 +++++++++-
 .../assignment/TestAMProblematicRegions.java       | 127 +++++++++++++++++++++
 3 files changed, 273 insertions(+), 12 deletions(-)

diff --git a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon
index 9c6916e..90351aa 100644
--- a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon
+++ b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon
@@ -17,27 +17,105 @@ See the License for the specific language governing permissions and
 limitations under the License.
 </%doc>
 <%import>
-org.apache.hadoop.hbase.master.assignment.AssignmentManager;
-org.apache.hadoop.hbase.master.assignment.AssignmentManager.RegionInTransitionStat;
-org.apache.hadoop.hbase.master.assignment.RegionStates.RegionFailedOpen;
-org.apache.hadoop.hbase.master.RegionState;
+java.util.Map;
+java.util.Set;
+java.util.SortedSet;
+java.util.concurrent.atomic.AtomicInteger;
+java.util.stream.Collectors;
 org.apache.hadoop.conf.Configuration;
 org.apache.hadoop.hbase.HBaseConfiguration;
 org.apache.hadoop.hbase.HConstants;
+org.apache.hadoop.hbase.ServerName;
+org.apache.hadoop.hbase.client.RegionInfo;
 org.apache.hadoop.hbase.client.RegionInfoDisplay;
-java.util.HashSet;
-java.util.SortedSet;
-java.util.Map;
-java.util.concurrent.atomic.AtomicInteger;
+org.apache.hadoop.hbase.master.RegionState;
+org.apache.hadoop.hbase.master.assignment.AssignmentManager;
+org.apache.hadoop.hbase.master.assignment.AssignmentManager.RegionInTransitionStat;
+org.apache.hadoop.hbase.master.assignment.RegionStates.RegionFailedOpen;
+org.apache.hadoop.hbase.util.Pair;
 </%import>
 <%args>
 AssignmentManager assignmentManager;
 int limit = 100;
 </%args>
 
-<%java SortedSet<RegionState> rit = assignmentManager
-  .getRegionStates().getRegionsInTransitionOrderedByTimestamp();
-%>
+<%java>
+SortedSet<RegionState> rit = assignmentManager.getRegionStates()
+    .getRegionsInTransitionOrderedByTimestamp();
+Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = assignmentManager
+    .getProblematicRegions();
+</%java>
+
+<%if !problematicRegions.isEmpty() %>
+<%java>
+int totalSize = problematicRegions.size();
+int sizePerPage = Math.min(10, totalSize);
+int numOfPages = (int) Math.ceil(totalSize * 1.0 / sizePerPage);
+</%java>
+    <section>
+    <h2><a name="problem-regions">Problematic Regions</a></h2>
+    <p>
+        <span>
+            <% problematicRegions.size() %> problematic region(s). Notice: the reported online
+             regionservers may be not right when there are regions in transition. Please check them
+              in regionserver's web UI.
+        </span>
+    </p>
+    <div class="tabbable">
+        <div class="tab-content">
+        <%java int recordItr = 0; %>
+        <%for Map.Entry<String, Pair<ServerName, Set<ServerName>>> entry : problematicRegions.entrySet() %>
+            <%if (recordItr % sizePerPage) == 0 %>
+                <%if recordItr == 0 %>
+                    <div class="tab-pane active" id="tab_prs<% (recordItr / sizePerPage) + 1 %>">
+                <%else>
+                    <div class="tab-pane" id="tab_prs<% (recordItr / sizePerPage) + 1 %>">
+                </%if>
+                <table class="table table-striped" style="margin-bottom:0px;">
+                    <tr>
+                        <th>Region</th>
+                        <th>Location in META</th>
+                        <th>Reported Online Region Servers</th>
+                    </tr>
+            </%if>
+
+            <tr>
+                <td><% entry.getKey() %></td>
+                <td><% entry.getValue().getFirst() %></td>
+                <td><% entry.getValue().getSecond().stream().map(ServerName::getServerName)
+                    .collect(Collectors.joining(", ")) %></td>
+            </tr>
+            <%java recordItr++; %>
+            <%if (recordItr % sizePerPage) == 0 %>
+                </table>
+                </div>
+            </%if>
+        </%for>
+
+        <%if (recordItr % sizePerPage) != 0 %>
+         <%for ; (recordItr % sizePerPage) != 0 ; recordItr++ %>
+            <tr><td colspan="3" style="height:61px"></td></tr>
+         </%for>
+         </table>
+         </div>
+        </%if>
+
+        </div>
+        <nav>
+         <ul class="nav nav-pills pagination">
+         <%for int i = 1 ; i <= numOfPages; i++ %>
+             <%if i == 1 %>
+             <li class="active">
+             <%else>
+             <li>
+             </%if>
+             <a href="#tab_prs<% i %>"><% i %></a></li>
+         </%for>
+         </ul>
+        </nav>
+    </div>
+    </section>
+</%if>
 
 <%if !rit.isEmpty() %>
 <%java>
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index 5ad3ba4..ea4a99f 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -158,6 +158,8 @@ public class AssignmentManager {
   private final RegionStates regionStates = new RegionStates();
   private final RegionStateStore regionStateStore;
 
+  private final Map<ServerName, Set<byte[]>> rsReports = new HashMap<>();
+
   private final boolean shouldAssignRegionsWithFavoredNodes;
   private final int assignDispatchWaitQueueMaxSize;
   private final int assignDispatchWaitMillis;
@@ -1065,13 +1067,18 @@ public class AssignmentManager {
     }
 
     ServerStateNode serverNode = regionStates.getOrCreateServer(serverName);
-
     synchronized (serverNode) {
       if (!serverNode.isInState(ServerState.ONLINE)) {
         LOG.warn("Got a report from a server result in state " + serverNode.getState());
         return;
       }
     }
+
+    // Track the regionserver reported online regions in memory.
+    synchronized (rsReports) {
+      rsReports.put(serverName, regionNames);
+    }
+
     if (regionNames.isEmpty()) {
       // nothing to do if we don't have regions
       LOG.trace("no online region found on {}", serverName);
@@ -2022,4 +2029,53 @@ public class AssignmentManager {
   MasterServices getMaster() {
     return master;
   }
+
+  /**
+   * Finds the potentially problematic opened regions. There are three cases:
+   * case 1. Master thinks the region is opened, but no regionserver reported it.
+   * case 2. Master thinks the region is opened on Server1, but only other servers reported it.
+   * case 3. More than one regionserver reported the region as opened.
+   *
+   * @return the map of potentially problematic opened regions. Key is the region name. Value is
+   *         a pair of location in meta and the regionservers which reported opened this region.
+   */
+  public Map<String, Pair<ServerName, Set<ServerName>>> getProblematicRegions() {
+    Map<String, Set<ServerName>> reportedOnlineRegions = new HashMap<>();
+    synchronized (rsReports) {
+      for (Map.Entry<ServerName, Set<byte[]>> entry : rsReports.entrySet()) {
+        for (byte[] regionName : entry.getValue()) {
+          reportedOnlineRegions
+              .computeIfAbsent(RegionInfo.getRegionNameAsString(regionName), r -> new HashSet<>())
+              .add(entry.getKey());
+        }
+      }
+    }
+
+    Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = new HashMap<>();
+    List<RegionState> rits = regionStates.getRegionsStateInTransition();
+    for (RegionState regionState : regionStates.getRegionStates()) {
+      // Only opened regions not in transition; cheap state check before the linear RIT scan.
+      if (regionState.isOpened() && !rits.contains(regionState)) {
+        String regionName = regionState.getRegion().getRegionNameAsString();
+        ServerName serverName = regionState.getServerName();
+        Set<ServerName> reportedServers = reportedOnlineRegions.get(regionName);
+        if (reportedServers != null) {
+          if (reportedServers.contains(serverName)) {
+            if (reportedServers.size() > 1) {
+              // More than one regionserver reported this region as opened.
+              problematicRegions.put(regionName, new Pair<>(serverName, reportedServers));
+            }
+          } else {
+            // Master thinks the region is on serverName, but only other servers reported it.
+            problematicRegions.put(regionName, new Pair<>(serverName, reportedServers));
+          }
+        } else {
+          // Master thinks the region is opened, but no regionserver reported it.
+          problematicRegions.put(regionName, new Pair<>(serverName, new HashSet<>()));
+        }
+      }
+    }
+
+    return problematicRegions;
+  }
 }
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAMProblematicRegions.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAMProblematicRegions.java
new file mode 100644
index 0000000..2c86a09
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAMProblematicRegions.java
@@ -0,0 +1,127 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master.assignment;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.Future;
+
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.client.RegionInfoBuilder;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.util.Pair;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Category({ MasterTests.class, MediumTests.class })
+public class TestAMProblematicRegions extends TestAssignmentManagerBase {
+  private static final Logger LOG = LoggerFactory.getLogger(TestAMProblematicRegions.class);
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+      HBaseClassTestRule.forClass(TestAMProblematicRegions.class);
+
+  @Test
+  public void testForMeta() {
+    byte[] metaRegionNameAsBytes = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName();
+    String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionNameAsString();
+    List<ServerName> serverNames = master.getServerManager().getOnlineServersList();
+    assertEquals(NSERVERS, serverNames.size());
+
+    Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = am.getProblematicRegions();
+
+    // Test for case1: Master thought this region opened, but no regionserver reported it.
+    assertTrue(problematicRegions.containsKey(metaRegionName));
+    Pair<ServerName, Set<ServerName>> pair = problematicRegions.get(metaRegionName);
+    ServerName locationInMeta = pair.getFirst();
+    Set<ServerName> reportedRegionServers = pair.getSecond();
+    assertTrue(serverNames.contains(locationInMeta));
+    assertEquals(0, reportedRegionServers.size());
+
+    // The server recorded as the location reports the region, so it is no longer problematic.
+    am.reportOnlineRegions(locationInMeta, Collections.singleton(metaRegionNameAsBytes));
+    problematicRegions = am.getProblematicRegions();
+    assertFalse(problematicRegions.containsKey(metaRegionName));
+  }
+
+  @Test
+  public void testForUserTable() throws Exception {
+    TableName tableName = TableName.valueOf("testForUserTable");
+    RegionInfo hri = createRegionInfo(tableName, 1);
+    String regionName = hri.getRegionNameAsString();
+    rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
+    Future<byte[]> future = submitProcedure(createAssignProcedure(hri));
+    waitOnFuture(future);
+
+    List<ServerName> serverNames = master.getServerManager().getOnlineServersList();
+    assertEquals(NSERVERS, serverNames.size());
+
+    // Test for case1: Master thought this region opened, but no regionserver reported it.
+    Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = am.getProblematicRegions();
+    assertTrue(problematicRegions.containsKey(regionName));
+    Pair<ServerName, Set<ServerName>> pair = problematicRegions.get(regionName);
+    ServerName locationInMeta = pair.getFirst();
+    Set<ServerName> reportedRegionServers = pair.getSecond();
+    assertTrue(serverNames.contains(locationInMeta));
+    assertEquals(0, reportedRegionServers.size());
+
+    // Test for case2: Master thought this region opened on Server1, but regionserver reported
+    // Server2
+    final ServerName tempLocationInMeta = locationInMeta;
+    final ServerName anotherServer =
+        serverNames.stream().filter(s -> !s.equals(tempLocationInMeta)).findFirst().get();
+    am.reportOnlineRegions(anotherServer, Collections.singleton(hri.getRegionName()));
+    problematicRegions = am.getProblematicRegions();
+    assertTrue(problematicRegions.containsKey(regionName));
+    pair = problematicRegions.get(regionName);
+    locationInMeta = pair.getFirst();
+    reportedRegionServers = pair.getSecond();
+    assertEquals(1, reportedRegionServers.size());
+    assertFalse(reportedRegionServers.contains(locationInMeta));
+    assertTrue(reportedRegionServers.contains(anotherServer));
+
+    // Test for case3: More than one regionservers reported opened this region.
+    am.reportOnlineRegions(locationInMeta, Collections.singleton(hri.getRegionName()));
+    problematicRegions = am.getProblematicRegions();
+    assertTrue(problematicRegions.containsKey(regionName));
+    pair = problematicRegions.get(regionName);
+    locationInMeta = pair.getFirst();
+    reportedRegionServers = pair.getSecond();
+    assertEquals(2, reportedRegionServers.size());
+    assertTrue(reportedRegionServers.contains(locationInMeta));
+    assertTrue(reportedRegionServers.contains(anotherServer));
+
+    // anotherServer now reports no regions, leaving only the expected location: not problematic.
+    am.reportOnlineRegions(anotherServer, Collections.emptySet());
+    problematicRegions = am.getProblematicRegions();
+    assertFalse(problematicRegions.containsKey(regionName));
+  }
+}
\ No newline at end of file