You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2020/05/14 20:09:26 UTC

[hbase] branch master updated: HBASE-24368 Let HBCKSCP clear 'Unknown Servers', even if RegionStateNode has RegionLocation == null

This is an automated email from the ASF dual-hosted git repository.

stack pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/master by this push:
     new 32e2682  HBASE-24368 Let HBCKSCP clear 'Unknown Servers', even if RegionStateNode has RegionLocation == null
32e2682 is described below

commit 32e2682310e725526ca3417757968d452bb5775b
Author: stack <st...@apache.org>
AuthorDate: Wed May 13 22:19:25 2020 -0700

    HBASE-24368 Let HBCKSCP clear 'Unknown Servers', even if RegionStateNode has RegionLocation == null
    
    hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java
     Edit a log.
    
    hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java
     Add override of isMatchingRegionLocation. Allow 'null' as a pass in
     HBCKSCP.
    
    hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
     Add a method for HBCKSCP to override and be less strict filtering
     assigns.
    
    hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp
     Some doc on what 'Unknown Servers' are.
---
 .../GCMultipleMergedRegionsProcedure.java          |  5 ++---
 .../master/procedure/HBCKServerCrashProcedure.java | 13 +++++++++++
 .../master/procedure/ServerCrashProcedure.java     | 26 ++++++++++++++++------
 .../main/resources/hbase-webapps/master/hbck.jsp   | 23 +++++++++++++++----
 4 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java
index 4fc5484..71fcd35 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java
@@ -99,12 +99,11 @@ public class GCMultipleMergedRegionsProcedure extends
         case GC_MERGED_REGIONS_PREPARE:
           // If GCMultipleMergedRegionsProcedure processing is slower than the CatalogJanitor's scan
           // interval, it will end resubmitting GCMultipleMergedRegionsProcedure for the same
-          // region, we can skip duplicate GCMultipleMergedRegionsProcedure while previous finished
+          // region. We can skip duplicate GCMultipleMergedRegionsProcedure while previous finished
           List<RegionInfo> parents = MetaTableAccessor.getMergeRegions(
             env.getMasterServices().getConnection(), mergedChild.getRegionName());
           if (parents == null || parents.isEmpty()) {
-            LOG.info("Region=" + mergedChild.getShortNameToLog()
-                + " info:merge qualifier has been deleted");
+            LOG.info("{} mergeXXX qualifiers have ALL been deleted", mergedChild.getShortNameToLog());
             return Flow.NO_MORE_STATE;
           }
           setNextState(GCMergedRegionsState.GC_MERGED_REGIONS_PURGE);
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java
index eec820c..a12b853 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java
@@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.client.Connection;
 import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.master.RegionState;
+import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
 import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
@@ -168,4 +169,16 @@ public class HBCKServerCrashProcedure extends ServerCrashProcedure {
       return this.reassigns;
     }
   }
+
+  /**
+   * The RegionStateNode will not have a location if a confirm of an OPEN fails. On fail,
+   * the RegionStateNode regionLocation is set to null. This is 'looser' than the test done
+   * in the superclass. The HBCKSCP has been scheduled by an operator via hbck2 probably at the
+   * behest of a report of an 'Unknown Server' in the 'HBCK Report'. Let the operator's operation
+   * succeed even in case where the region location in the RegionStateNode is null.
+   */
+  @Override
+  protected boolean isMatchingRegionLocation(RegionStateNode rsn) {
+    return super.isMatchingRegionLocation(rsn) || rsn.getRegionLocation() == null;
+  }
 }
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
index 178343f..076c266 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
@@ -451,6 +451,15 @@ public class ServerCrashProcedure
   }
 
   /**
+   * Moved out here so can be overridden by the HBCK fix-up SCP to be less strict about what
+   * it will tolerate as a 'match'.
+   * @return True if the region location in <code>rsn</code> matches that of this crashed server.
+   */
+  protected boolean isMatchingRegionLocation(RegionStateNode rsn) {
+    return this.serverName.equals(rsn.getRegionLocation());
+  }
+
+  /**
    * Assign the regions on the crashed RS to other Rses.
    * <p/>
    * In this method we will go through all the RegionStateNodes of the give regions to find out
@@ -467,14 +476,17 @@ public class ServerCrashProcedure
       regionNode.lock();
       try {
         // This is possible, as when a server is dead, TRSP will fail to schedule a RemoteProcedure
-        // to us and then try to assign the region to a new RS. And before it has updated the region
+        // and then try to assign the region to a new RS. And before it has updated the region
         // location to the new RS, we may have already called the am.getRegionsOnServer so we will
-        // consider the region is still on us. And then before we arrive here, the TRSP could have
-        // updated the region location, or even finished itself, so the region is no longer on us
-        // any more, we should not try to assign it again. Please see HBASE-23594 for more details.
-        if (!serverName.equals(regionNode.getRegionLocation())) {
-          LOG.info("{} found a region {} which is no longer on us {}, give up assigning...", this,
-            regionNode, serverName);
+        // consider the region is still on this crashed server. Then before we arrive here, the
+        // TRSP could have updated the region location, or even finished itself, so the region is
+        // no longer on this crashed server any more. We should not try to assign it again. Please
+        // see HBASE-23594 for more details.
+        // UPDATE: HBCKServerCrashProcedure overrides isMatchingRegionLocation; this check can get
+        // in the way of our clearing out 'Unknown Servers'.
+        if (!isMatchingRegionLocation(regionNode)) {
+          LOG.info("{} found {} whose regionLocation no longer matches {}, skipping assign...",
+            this, regionNode, serverName);
           continue;
         }
         if (regionNode.getProcedure() != null) {
diff --git a/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp b/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp
index d90827c..f0a2ce1 100644
--- a/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp
+++ b/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp
@@ -112,8 +112,7 @@
         need to check the server still exists. If not, schedule <em>ServerCrashProcedure</em> for it. If exists,
         restart Server2 and Server1):
         3. More than one regionserver reports opened this region (Fix: restart the RegionServers).
-        Notice: the reported online regionservers may be not right when there are regions in transition.
-        Please check them in regionserver's web UI.
+        Note: the reported online regionservers may not be up-to-date when there are regions in transition.
         </span>
       </p>
 
@@ -165,8 +164,9 @@
   </div>
       <p>
         <span>
-          The below are Regions we've lost account of. To be safe, run bulk load of any data found in these Region orphan directories back into the HBase cluster.
-          First make sure <em>hbase:meta</em> is in a healthy state, that there are no holes, overlaps or inconsistencies (else bulk load may complain);
+          The below are Regions we've lost account of. To be safe, run bulk load of any data found under these Region orphan directories to have the
+          cluster re-adopt data.
+          First make sure <em>hbase:meta</em> is in a healthy state, that there are no holes, overlaps or inconsistencies (else bulk load may fail);
           run <em>hbck2 fixMeta</em>. Once this is done, per Region below, run a bulk
           load -- <em>$ hbase completebulkload REGION_DIR_PATH TABLE_NAME</em> -- and then delete the desiccated directory content (HFiles are removed upon
           successful load; all that is left are empty directories and occasionally a seqid marking file).
@@ -259,6 +259,21 @@
                 <h2>Unknown Servers</h2>
               </div>
             </div>
+            <p>
+              <span>The below are servers mentioned in the hbase:meta table that are neither 'live' nor known 'dead'.
+                The server likely belongs to an older cluster epoch since replaced by a new instance because of a restart/crash.
+                To clear 'Unknown Servers', run 'hbck2 scheduleRecoveries UNKNOWN_SERVERNAME'. This will schedule a ServerCrashProcedure.
+                It will clear out 'Unknown Server' references and schedule reassigns of any Regions that were associated with this host.
+                But first, be sure the referenced Region is not currently stuck looping trying to OPEN. Does it show as a Region-In-Transition on the
+                Master home page? Is it mentioned in the 'Procedures and Locks' Procedures list? If so, perhaps it is stuck in a loop
+                trying to OPEN but unable to because of a missing reference or file.
+                Read the Master log looking for the most recent
+                mentions of the associated Region name. Try and address any such complaint first. If successful, a side-effect
+                should be the clean up of the 'Unknown Servers' list. It may take a while. OPENs are retried forever but the interval
+                between retries grows. The 'Unknown Server' may be cleared because it is just the last RegionServer the Region was
+                successfully opened on; on the next open, the 'Unknown Server' will be purged.
+              </span>
+            </p>
             <table class="table table-striped">
               <tr>
                 <th>RegionInfo</th>