Posted to commits@hbase.apache.org by st...@apache.org on 2019/11/19 15:33:24 UTC

[hbase] branch branch-2 updated: HBASE-23315 Miscellaneous HBCK Report page cleanup

This is an automated email from the ASF dual-hosted git repository.

stack pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2 by this push:
     new 70771b6  HBASE-23315 Miscellaneous HBCK Report page cleanup
70771b6 is described below

commit 70771b603ed125f69dc3ce58d6ee1db0fc35d220
Author: stack <st...@apache.org>
AuthorDate: Mon Nov 18 15:03:10 2019 -0800

    HBASE-23315 Miscellaneous HBCK Report page cleanup
    
     * Add a bit of javadoc around SerialReplicationChecker.
     * Minuscule edit to the profiler jsp page, and a bit of doc on how to make it work that might help.
     * Add some detail when an AIOOBE comes out of BitSetNode, to help with debug.
     * Change HbckChore to log region names instead of encoded names; this helps when doing diagnostics, since you can take a region name and query it in the shell to find out all about the region according to hbase:meta (see the example after this list).
     * Add some fix-it help inline in the HBCK Report page.
     * Add counts to the procedures page so you can see whether progress is being made; move the listing of WALs to the end of the page.
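
    For example (the region name here is invented), a full region name from the
    HBCK Report can be handed straight to the shell as the hbase:meta row key:

        hbase> get 'hbase:meta', 'TestTable,,1574121600000.d1f0a0c01d6f2b6f7a6f5aa4d0c0a0c0.'

    The returned cells (info:regioninfo, info:server, and friends) describe the
    region's state according to hbase:meta.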
---
 .../org/apache/hadoop/hbase/MetaTableAccessor.java |  6 +-
 .../apache/hadoop/hbase/http/ProfileServlet.java   |  4 +-
 .../hadoop/hbase/procedure2/store/BitSetNode.java  | 12 ++-
 .../org/apache/hadoop/hbase/master/HbckChore.java  | 22 ++---
 .../regionserver/SerialReplicationChecker.java     |  9 +-
 .../org/apache/hadoop/hbase/tool/CanaryTool.java   |  6 +-
 .../main/resources/hbase-webapps/master/hbck.jsp   | 14 +++-
 .../resources/hbase-webapps/master/procedures.jsp  | 98 +++++++++++++---------
 .../hbase/master/assignment/TestHbckChore.java     |  6 +-
 9 files changed, 110 insertions(+), 67 deletions(-)

diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
index 8d61f99..09a0f95 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
@@ -137,7 +137,7 @@ import org.apache.hbase.thirdparty.com.google.common.base.Throwables;
  *                             columns: info:merge0001, info:merge0002. You may also see 'mergeA',
  *                             and 'mergeB'. This is the old form, replaced by the new format that
  *                             allows for more than two parents to be merged at a time.
- * TODO: Add rep_barrier for serial replication explaination.
+ * TODO: Add rep_barrier for serial replication explanation. See SerialReplicationChecker.
  * </pre>
  * </p>
  * <p>
@@ -608,6 +608,7 @@ public class MetaTableAccessor {
    * @param excludeOfflinedSplitParents don't return split parents
    * @return Return list of regioninfos and server addresses.
    */
+  // What happens here when 1M regions in hbase:meta? This won't scale?
   public static List<Pair<RegionInfo, ServerName>> getTableRegionsAndLocations(
       Connection connection, @Nullable final TableName tableName,
       final boolean excludeOfflinedSplitParents) throws IOException {
@@ -1988,6 +1989,9 @@ public class MetaTableAccessor {
     return put;
   }
 
+  /**
+   * See class comment on SerialReplicationChecker
+   */
   public static void addReplicationBarrier(Put put, long openSeqNum) throws IOException {
     put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
       .setRow(put.getRow())
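
A minimal usage sketch of addReplicationBarrier above (hedged: regionInfo, openSeqNum
and the metaTable handle are assumed to be in scope; they are not part of this hunk):

    // Key the Put on the region's row in hbase:meta, then add one
    // rep_barrier cell holding the region's open sequence number.
    Put put = new Put(regionInfo.getRegionName());
    MetaTableAccessor.addReplicationBarrier(put, openSeqNum);
    metaTable.put(put);  // 'metaTable' is a hypothetical Table for hbase:meta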
diff --git a/hbase-http/src/main/java/org/apache/hadoop/hbase/http/ProfileServlet.java b/hbase-http/src/main/java/org/apache/hadoop/hbase/http/ProfileServlet.java
index 642d05a..fc75530 100644
--- a/hbase-http/src/main/java/org/apache/hadoop/hbase/http/ProfileServlet.java
+++ b/hbase-http/src/main/java/org/apache/hadoop/hbase/http/ProfileServlet.java
@@ -270,7 +270,7 @@ public class ProfileServlet extends HttpServlet {
             resp.getWriter().write(
               "Started [" + event.getInternalName() +
               "] profiling. This page will automatically redirect to " +
-              relativeUrl + " after " + duration + " seconds.\n\ncommand:\n" +
+              relativeUrl + " after " + duration + " seconds.\n\nCommand:\n" +
               Joiner.on(" ").join(cmd));
 
             // to avoid auto-refresh by ProfileOutputServlet, refreshDelay can be specified
@@ -395,4 +395,4 @@ public class ProfileServlet extends HttpServlet {
 
   }
 
-}
\ No newline at end of file
+}
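
A note that may save a reader some digging (this reflects ProfileServlet's existing
documentation, not anything changed here): the servlet shells out to async-profiler,
located via the ASYNC_PROFILER_HOME environment variable (or the -Dasync.profiler.home
system property), so the redirect above only ever fires once something like the
following is in place before the daemon starts:

    $ export ASYNC_PROFILER_HOME=/opt/async-profiler   # hypothetical install path

The profiler UI itself lives under the process web UI at /prof.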
diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/BitSetNode.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/BitSetNode.java
index f42199b..78d2d91 100644
--- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/BitSetNode.java
+++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/BitSetNode.java
@@ -407,7 +407,15 @@ class BitSetNode {
     int wordIndex = bitmapIndex >> ADDRESS_BITS_PER_WORD;
     long value = (1L << bitmapIndex);
 
-    modified[wordIndex] |= value;
+    try {
+      modified[wordIndex] |= value;
+    } catch (ArrayIndexOutOfBoundsException aioobe) {
+      // We've gotten an AIOOBE in here; add detail to help debug.
+      ArrayIndexOutOfBoundsException aioobe2 =
+          new ArrayIndexOutOfBoundsException("pid=" + procId + ", deleted=" + isDeleted);
+      aioobe2.initCause(aioobe);
+      throw aioobe2;
+    }
     if (isDeleted) {
       deleted[wordIndex] |= value;
     } else {
@@ -431,4 +439,4 @@ class BitSetNode {
   private static long alignDown(final long x) {
     return x & -BITS_PER_WORD;
   }
-}
\ No newline at end of file
+}
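
My reading of where the ArrayIndexOutOfBoundsException can come from (an assumption,
not something the commit states): with 64-bit words, wordIndex is bitmapIndex >> 6,
so a procId mapping past the node's allocated words indexes off the end of modified[].
A worked example with invented values:

    long start = 0;                     // the node's first procId (hypothetical)
    long procId = 300;
    int bitmapIndex = (int) (procId - start);
    int wordIndex = bitmapIndex >> 6;   // 300 >> 6 == 4
    // If modified.length <= 4, 'modified[wordIndex] |= value' throws the
    // AIOOBE that the new catch block re-throws with the pid attached.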
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java
index fc0111d..cf43685 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java
@@ -190,10 +190,10 @@ public class HbckChore extends ScheduledChore {
       RegionInfo regionInfo = regionState.getRegion();
       if (master.getTableStateManager()
           .isTableState(regionInfo.getTable(), TableState.State.DISABLED)) {
-        disabledTableRegions.add(regionInfo.getEncodedName());
+        disabledTableRegions.add(regionInfo.getRegionNameAsString());
       }
       if (regionInfo.isSplitParent()) {
-        splitParentRegions.add(regionInfo.getEncodedName());
+        splitParentRegions.add(regionInfo.getRegionNameAsString());
       }
       HbckRegionInfo.MetaEntry metaEntry =
           new HbckRegionInfo.MetaEntry(regionInfo, regionState.getServerName(),
@@ -212,7 +212,7 @@ public class HbckChore extends ScheduledChore {
         String encodedRegionName = RegionInfo.encodeRegionName(regionName);
         HbckRegionInfo hri = regionInfoMap.get(encodedRegionName);
         if (hri == null) {
-          orphanRegionsOnRS.put(encodedRegionName, serverName);
+          orphanRegionsOnRS.put(RegionInfo.getRegionNameAsString(regionName), serverName);
           continue;
         }
         hri.addServer(hri.getMetaEntry(), serverName);
@@ -223,7 +223,6 @@ public class HbckChore extends ScheduledChore {
         numRegions, rsReports.size(), orphanRegionsOnFS.size());
 
     for (Map.Entry<String, HbckRegionInfo> entry : regionInfoMap.entrySet()) {
-      String encodedRegionName = entry.getKey();
       HbckRegionInfo hri = entry.getValue();
       ServerName locationInMeta = hri.getMetaEntry().getRegionServer();
       if (hri.getDeployedOn().size() == 0) {
@@ -231,21 +230,24 @@ public class HbckChore extends ScheduledChore {
           continue;
         }
         // skip the offline region which belong to disabled table.
-        if (disabledTableRegions.contains(encodedRegionName)) {
+        if (disabledTableRegions.contains(hri.getRegionNameAsString())) {
           continue;
         }
         // skip the split parent regions
-        if (splitParentRegions.contains(encodedRegionName)) {
+        if (splitParentRegions.contains(hri.getRegionNameAsString())) {
           continue;
         }
         // Master thought this region opened, but no regionserver reported it.
-        inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, new LinkedList<>()));
+        inconsistentRegions.put(hri.getRegionNameAsString(),
+            new Pair<>(locationInMeta, new LinkedList<>()));
       } else if (hri.getDeployedOn().size() > 1) {
         // More than one regionserver reported opened this region
-        inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn()));
+        inconsistentRegions.put(hri.getRegionNameAsString(),
+            new Pair<>(locationInMeta, hri.getDeployedOn()));
       } else if (!hri.getDeployedOn().get(0).equals(locationInMeta)) {
         // Master thought this region opened on Server1, but regionserver reported Server2
-        inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn()));
+        inconsistentRegions.put(hri.getRegionNameAsString(),
+            new Pair<>(locationInMeta, hri.getDeployedOn()));
       }
     }
   }
@@ -339,4 +341,4 @@ public class HbckChore extends ScheduledChore {
   public long getCheckingEndTimestamp() {
     return this.checkingEndTimestamp;
   }
-}
\ No newline at end of file
+}
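
To make the motivation concrete (a hedged illustration; the example values are
invented): the encoded name is just the MD5 suffix of the full region name, while the
full name doubles as the region's row key in hbase:meta, so it is the form a reader
can act on:

    // The two name forms for the same RegionInfo.
    static String describe(RegionInfo ri) {
      return "encoded=" + ri.getEncodedName()       // e.g. "d1f0a0c0..." (opaque)
          + " full=" + ri.getRegionNameAsString();  // e.g. "TestTable,,1574121600000.d1f0a0c0...."
    }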
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/SerialReplicationChecker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/SerialReplicationChecker.java
index 4b88050..321bbb4 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/SerialReplicationChecker.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/SerialReplicationChecker.java
@@ -50,12 +50,13 @@ import org.apache.hbase.thirdparty.com.google.common.cache.LoadingCache;
  * </p>
  * <p>
  * We record all the open sequence number for a region in a special family in meta, which is called
- * 'barrier', so there will be a sequence of open sequence number (b1, b2, b3, ...). We call [bn,
- * bn+1) a range, and it is obvious that a region will always be on the same RS within a range.
+ * 'rep_barrier', so there will be a sequence of open sequence number (b1, b2, b3, ...). We call
+ * [bn, bn+1) a range, and it is obvious that a region will always be on the same RS within a
+ * range.
  * <p>
  * When split and merge, we will also record the parent for the generated region(s) in the special
- * family in meta. And also, we will write an extra 'open sequence number' for the parent region(s),
- * which is the max sequence id of the region plus one.
+ * family in meta. And also, we will write an extra 'open sequence number' for the parent
+ * region(s), which is the max sequence id of the region plus one.
  * </p>
  * </p>
  * <p>
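
A worked example of the range idea in the javadoc above (a sketch under invented
values, not the HBase implementation):

    import java.util.Arrays;

    public class BarrierRangeSketch {
      public static void main(String[] args) {
        // Hypothetical open sequence numbers from a region's rep_barrier cells.
        long[] barriers = {10, 25, 40};
        long seqId = 30;  // sequence id of a WAL entry awaiting replication
        int idx = Arrays.binarySearch(barriers, seqId);
        int range = (idx >= 0) ? idx : -idx - 2;
        // range == 1, i.e. [25, 40): under serial replication this entry may
        // only be pushed once everything in the earlier ranges has gone out.
        System.out.println("range index = " + range);
      }
    }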
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java
index 4f59cf3..af9b879 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java
@@ -277,13 +277,15 @@ public class CanaryTool implements Tool, Canary {
 
     public void publishReadFailure(ServerName serverName, RegionInfo region, Exception e) {
       incReadFailureCount();
-      LOG.error("Read from {} on {} failed", region.getRegionNameAsString(), serverName, e);
+      LOG.error("Read from {} on serverName={} failed",
+          region.getRegionNameAsString(), serverName, e);
     }
 
     public void publishReadFailure(ServerName serverName, RegionInfo region,
         ColumnFamilyDescriptor column, Exception e) {
       incReadFailureCount();
-      LOG.error("Read from {} on {} {} failed", region.getRegionNameAsString(), serverName,
+      LOG.error("Read from {} on serverName={}, columnFamily={} failed",
+          region.getRegionNameAsString(), serverName,
           column.getNameAsString(), e);
     }
 
diff --git a/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp b/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp
index 0ac6678..f89aac8 100644
--- a/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp
+++ b/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp
@@ -78,7 +78,7 @@
 
   <div class="row">
     <div class="page-header">
-  <p><span>This page displays two reports: the 'HBCK Chore Report' and the 'CatalogJanitor Consistency Issues' report. Only titles show if there are no problems to report. Note some conditions are transitory as regions migrate.</span></p>
+      <p><span>This page displays two reports: the 'HBCK Chore Report' and the 'CatalogJanitor Consistency Issues' report. Only titles show if there are no problems to report. Note some conditions are <em>transitory</em> as regions migrate.</span></p>
     </div>
   </div>
   <div class="row">
@@ -119,7 +119,7 @@
 
   <table class="table table-striped">
     <tr>
-      <th>Region Encoded Name</th>
+      <th>Region Name</th>
       <th>Location in META</th>
       <th>Reported Online RegionServers</th>
     </tr>
@@ -142,10 +142,18 @@
       <h2>Orphan Regions on RegionServer</h2>
     </div>
   </div>
+      <p>
+        <span>
+          Listed below are Regions we've lost account of. To be safe, bulk load any data found in these orphan Region directories back into the HBase cluster.
+          First make sure hbase:meta is in a healthy state; run 'hbck2 fixMeta' to be sure. Once this is done, for each Region below, run a bulk
+          load -- '$ hbase completebulkload REGION_DIR_PATH TABLE_NAME' -- and then delete the desiccated directory content (HFiles are removed upon successful load; all that is left are empty directories
+          and occasionally a seqid marking file).
+        </span>
+      </p>
 
   <table class="table table-striped">
     <tr>
-      <th>Region Encoded Name</th>
+      <th>Region Name</th>
       <th>Reported Online RegionServer</th>
     </tr>
     <% for (Map.Entry<String, ServerName> entry : orphanRegionsOnRS.entrySet()) { %>
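
The repair sequence the new page text describes, spelled out (hedged: the jar and
directory paths are hypothetical; 'hbase hbck -j' is the documented way to invoke
HBCK2):

    # 1. Get hbase:meta healthy first.
    $ hbase hbck -j /path/to/hbase-hbck2.jar fixMeta
    # 2. For each orphan Region directory, bulk load its HFiles back into the table.
    $ hbase completebulkload hdfs:///hbase/data/default/TestTable/REGION_DIR TestTable
    # 3. Remove the emptied Region directory once the load succeeds.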
diff --git a/hbase-server/src/main/resources/hbase-webapps/master/procedures.jsp b/hbase-server/src/main/resources/hbase-webapps/master/procedures.jsp
index c918b30..ea252cf 100644
--- a/hbase-server/src/main/resources/hbase-webapps/master/procedures.jsp
+++ b/hbase-server/src/main/resources/hbase-webapps/master/procedures.jsp
@@ -81,11 +81,14 @@
         <th>Errors</th>
         <th>Parameters</th>
     </tr>
-    <% for (Procedure<?> proc : procedures) { 
+    <%
+      int displayCount = 0;
+      for (Procedure<?> proc : procedures) {
       // Don't show SUCCESS procedures.
       if (proc.isSuccess()) {
         continue;
       }
+      displayCount++;
     %>
       <tr>
         <td><%= proc.getProcId() %></td>
@@ -99,11 +102,65 @@
         <td><%= escapeXml(proc.toString()) %></td>
       </tr>
     <% } %>
+    <%
+    if (displayCount > 0) {
+    %>
+      <p><%= displayCount %> procedure(s).</p>
+    <%
+    }
+    %>
   </table>
 </div>
 <br />
 <div class="container-fluid content">
   <div class="row">
+      <div class="page-header">
+          <h1>Locks</h1>
+      </div>
+  </div>
+    <%
+    if (lockedResources.size() > 0) {
+    %>
+    <p><%= lockedResources.size() %> lock(s).</p>
+    <%
+    }
+    %>
+  <% for (LockedResource lockedResource : lockedResources) { %>
+    <h2><%= lockedResource.getResourceType() %>: <%= lockedResource.getResourceName() %></h2>
+    <%
+      switch (lockedResource.getLockType()) {
+      case EXCLUSIVE:
+    %>
+    <p>Lock type: EXCLUSIVE</p>
+    <p>Owner procedure: <%= escapeXml(lockedResource.getExclusiveLockOwnerProcedure().toStringDetails()) %></p>
+    <%
+        break;
+      case SHARED:
+    %>
+    <p>Lock type: SHARED</p>
+    <p>Number of shared locks: <%= lockedResource.getSharedLockCount() %></p>
+    <%
+        break;
+      }
+
+      List<Procedure<?>> waitingProcedures = lockedResource.getWaitingProcedures();
+
+      if (!waitingProcedures.isEmpty()) {
+    %>
+        <h3>Waiting procedures</h3>
+        <table class="table table-striped" width="90%" >
+        <% for (Procedure<?> proc : waitingProcedures) { %>
+         <tr>
+            <td><%= escapeXml(proc.toStringDetails()) %></td>
+          </tr>
+        <% } %>
+        </table>
+    <% } %>
+  <% } %>
+</div>
+<br />
+<div class="container-fluid content">
+  <div class="row">
     <div class="page-header">
       <h2>Procedure WAL State</h2>
     </div>
@@ -206,44 +263,5 @@
   </div>
 </div>
 <br />
-<div class="container-fluid content">
-  <div class="row">
-      <div class="page-header">
-          <h1>Locks</h1>
-      </div>
-  </div>
-  <% for (LockedResource lockedResource : lockedResources) { %>
-    <h2><%= lockedResource.getResourceType() %>: <%= lockedResource.getResourceName() %></h2>
-    <%
-      switch (lockedResource.getLockType()) {
-      case EXCLUSIVE:
-    %>
-    <p>Lock type: EXCLUSIVE</p>
-    <p>Owner procedure: <%= escapeXml(lockedResource.getExclusiveLockOwnerProcedure().toStringDetails()) %></p>
-    <%
-        break;
-      case SHARED:
-    %>
-    <p>Lock type: SHARED</p>
-    <p>Number of shared locks: <%= lockedResource.getSharedLockCount() %></p>
-    <%
-        break;
-      }
-
-      List<Procedure<?>> waitingProcedures = lockedResource.getWaitingProcedures();
-
-      if (!waitingProcedures.isEmpty()) {
-    %>
-        <h3>Waiting procedures</h3>
-        <table class="table table-striped" width="90%" >
-        <% for (Procedure<?> proc : procedures) { %>
-         <tr>
-            <td><%= escapeXml(proc.toStringDetails()) %></td>
-          </tr>
-        <% } %>
-        </table>
-    <% } %>
-  <% } %>
-</div>
 
 <jsp:include page="footer.jsp" />
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestHbckChore.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestHbckChore.java
index 4c0a194..ea70508 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestHbckChore.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestHbckChore.java
@@ -69,7 +69,7 @@ public class TestHbckChore extends TestAssignmentManagerBase {
   @Test
   public void testForMeta() {
     byte[] metaRegionNameAsBytes = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName();
-    String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getEncodedName();
+    String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionNameAsString();
     List<ServerName> serverNames = master.getServerManager().getOnlineServersList();
     assertEquals(NSERVERS, serverNames.size());
 
@@ -96,7 +96,7 @@ public class TestHbckChore extends TestAssignmentManagerBase {
   public void testForUserTable() throws Exception {
     TableName tableName = TableName.valueOf("testForUserTable");
     RegionInfo hri = createRegionInfo(tableName, 1);
-    String regionName = hri.getEncodedName();
+    String regionName = hri.getRegionNameAsString();
     rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
     Future<byte[]> future = submitProcedure(createAssignProcedure(hri));
     waitOnFuture(future);
@@ -154,7 +154,7 @@ public class TestHbckChore extends TestAssignmentManagerBase {
   public void testForDisabledTable() throws Exception {
     TableName tableName = TableName.valueOf("testForDisabledTable");
     RegionInfo hri = createRegionInfo(tableName, 1);
-    String regionName = hri.getEncodedName();
+    String regionName = hri.getRegionNameAsString();
     rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
     Future<byte[]> future = submitProcedure(createAssignProcedure(hri));
     waitOnFuture(future);