You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2020/03/20 22:26:45 UTC

[hbase] branch branch-2.3 updated: HBASE-23984 [Flakey Tests] TestMasterAbortAndRSGotKilled fails in teardown (#1311)

This is an automated email from the ASF dual-hosted git repository.

stack pushed a commit to branch branch-2.3
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2.3 by this push:
     new 16441fb  HBASE-23984 [Flakey Tests] TestMasterAbortAndRSGotKilled fails in teardown (#1311)
16441fb is described below

commit 16441fbd6854bea38491cffdd4f98fd28522d8ed
Author: Michael Stack <sa...@users.noreply.github.com>
AuthorDate: Fri Mar 20 15:25:06 2020 -0700

    HBASE-23984 [Flakey Tests] TestMasterAbortAndRSGotKilled fails in teardown (#1311)
    
    hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
     Change parameter name and add javadoc to make it more clear what the
     param actually is.
    
    hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java
     Move postOpenDeployTasks so if it fails to talk to the Master -- which
     can happen on cluster shutdown -- then we will do cleanup of state;
     without this the RS can get stuck and won't go down.
    
    hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java
     Add handleException so CRH looks more like UnassignRegionHandler and
     AssignRegionHandler around exception handling. Add a bit of doc on
     why CRH.
    
    hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java
     Right shift most of the body of process so can add in a finally
     that cleans up rs.getRegionsInTransitionInRS is on exception
     (otherwise outstanding entries can stop a RS going down on cluster
     shutdown)
    
    Signed-off-by: Nick Dimiduk <nd...@apache.org>
    Signed-off-by: Duo Zhang <zh...@apache.org>
---
 .../apache/hadoop/hbase/regionserver/HRegion.java  | 14 +++--
 .../hadoop/hbase/regionserver/HRegionServer.java   | 12 +++--
 .../regionserver/handler/AssignRegionHandler.java  |  8 ++-
 .../regionserver/handler/CloseRegionHandler.java   | 61 +++++++++++-----------
 .../handler/UnassignRegionHandler.java             | 17 +++---
 5 files changed, 64 insertions(+), 48 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java
index 72eb351..c58a591 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java
@@ -944,7 +944,8 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
           " initialization.");
       }
       if (LOG.isDebugEnabled()) {
-        LOG.debug("Region open journal:\n" + status.prettyPrintJournal());
+        LOG.debug("Region open journal for {}:\n{}", this.getRegionInfo().getEncodedName(),
+          status.prettyPrintJournal());
       }
       status.cleanup();
     }
@@ -1553,7 +1554,8 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
       }
     } finally {
       if (LOG.isDebugEnabled()) {
-        LOG.debug("Region close journal:\n" + status.prettyPrintJournal());
+        LOG.debug("Region close journal for {}:\n{}", this.getRegionInfo().getEncodedName(),
+          status.prettyPrintJournal());
       }
       status.cleanup();
     }
@@ -1755,7 +1757,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
         Closeables.close(this.metricsRegionWrapper, true);
       }
       status.markComplete("Closed");
-      LOG.info("Closed " + this);
+      LOG.info("Closed {}", this);
       return result;
     } finally {
       lock.writeLock().unlock();
@@ -2266,7 +2268,8 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
     } finally {
       if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction);
       if (status != null) {
-        LOG.debug("Compaction status journal:\n\t" + status.prettyPrintJournal());
+        LOG.debug("Compaction status journal for {}:\n\t{}", this.getRegionInfo().getEncodedName(),
+          status.prettyPrintJournal());
         status.cleanup();
       }
     }
@@ -2413,7 +2416,8 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
       }
     } finally {
       lock.readLock().unlock();
-      LOG.debug("Flush status journal:\n\t" + status.prettyPrintJournal());
+      LOG.debug("Flush status journal for {}:\n\t{}", this.getRegionInfo().getEncodedName(),
+        status.prettyPrintJournal());
       status.cleanup();
     }
   }
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index fc0b693..0de2c17 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -53,6 +53,7 @@ import java.util.concurrent.ConcurrentSkipListMap;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
+import java.util.stream.Collectors;
 import javax.management.MalformedObjectNameException;
 import javax.servlet.http.HttpServlet;
 import org.apache.commons.lang3.RandomUtils;
@@ -1457,6 +1458,9 @@ public class HRegionServer extends HasThread implements
                 " because some regions failed closing");
           }
           break;
+        } else {
+          LOG.debug("Waiting on {}", this.regionsInTransitionInRS.keySet().stream().
+            map(e -> Bytes.toString(e)).collect(Collectors.joining(", ")));
         }
         if (sleep(200)) {
           interrupted = true;
@@ -3187,10 +3191,12 @@ public class HRegionServer extends HasThread implements
    *
    * @param encodedName Region to close
    * @param abort True if we are aborting
+   * @param destination Where the Region is being moved too... maybe null if unknown.
    * @return True if closed a region.
    * @throws NotServingRegionException if the region is not online
    */
-  protected boolean closeRegion(String encodedName, final boolean abort, final ServerName sn)
+  protected boolean closeRegion(String encodedName, final boolean abort,
+        final ServerName destination)
       throws NotServingRegionException {
     //Check for permissions to close.
     HRegion actualRegion = this.getRegion(encodedName);
@@ -3216,7 +3222,7 @@ public class HRegionServer extends HasThread implements
         // We're going to try to do a standard close then.
         LOG.warn("The opening for region " + encodedName + " was done before we could cancel it." +
             " Doing a standard close now");
-        return closeRegion(encodedName, abort, sn);
+        return closeRegion(encodedName, abort, destination);
       }
       // Let's get the region from the online region list again
       actualRegion = this.getRegion(encodedName);
@@ -3247,7 +3253,7 @@ public class HRegionServer extends HasThread implements
     if (hri.isMetaRegion()) {
       crh = new CloseMetaHandler(this, this, hri, abort);
     } else {
-      crh = new CloseRegionHandler(this, this, hri, abort, sn);
+      crh = new CloseRegionHandler(this, this, hri, abort, destination);
     }
     this.executorService.submit(crh);
     return true;
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java
index 76e8f80..3474bf9 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -34,7 +34,6 @@ import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
 
 /**
@@ -136,6 +135,8 @@ public class AssignRegionHandler extends EventHandler {
       cleanUpAndReportFailure(e);
       return;
     }
+    // From here on out, this is PONR. We can not revert back. The only way to address an
+    // exception from here on out is to abort the region server.
     rs.postOpenDeployTasks(new PostOpenDeployContext(region, openProcId, masterSystemTime));
     rs.addRegion(region);
     LOG.info("Opened {}", regionName);
@@ -156,6 +157,9 @@ public class AssignRegionHandler extends EventHandler {
   protected void handleException(Throwable t) {
     LOG.warn("Fatal error occurred while opening region {}, aborting...",
       regionInfo.getRegionNameAsString(), t);
+    // Clear any reference in getServer().getRegionsInTransitionInRS() otherwise can hold up
+    // regionserver abort on cluster shutdown. HBASE-23984.
+    getServer().getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes());
     getServer().abort(
       "Failed to open region " + regionInfo.getRegionNameAsString() + " and can not recover", t);
   }
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java
index d4ea004..6f43042 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java
@@ -1,4 +1,4 @@
-/**
+/*
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
@@ -19,7 +19,6 @@
 package org.apache.hadoop.hbase.regionserver.handler;
 
 import java.io.IOException;
-
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.Server;
 import org.apache.hadoop.hbase.ServerName;
@@ -38,9 +37,13 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProto
 /**
  * Handles closing of a region on a region server.
  * <p/>
- * Now for regular close region request, we will use {@link UnassignRegionHandler} instead. But when
- * shutting down the region server, will also close regions and the related methods still use this
- * class so we keep it here.
+ * In normal operation, we use {@link UnassignRegionHandler} closing Regions but when shutting down
+ * the region server and closing out Regions, we use this handler instead; it does not expect to
+ * be able to communicate the close back to the Master.
+ * <p>Expects that the close *has* been registered in the hosting RegionServer before
+ * submitting this Handler; i.e. <code>rss.getRegionsInTransitionInRS().putIfAbsent(
+ * this.regionInfo.getEncodedNameAsBytes(), Boolean.FALSE);</code> has been called first.
+ * In here when done, we do the deregister.</p>
  * @see UnassignRegionHandler
  */
 @InterfaceAudience.Private
@@ -62,11 +65,7 @@ public class CloseRegionHandler extends EventHandler {
 
   /**
    * This method used internally by the RegionServer to close out regions.
-   * @param server
-   * @param rsServices
-   * @param regionInfo
    * @param abort If the regionserver is aborting.
-   * @param destination
    */
   public CloseRegionHandler(final Server server,
       final RegionServerServices rsServices,
@@ -92,13 +91,13 @@ public class CloseRegionHandler extends EventHandler {
   }
 
   @Override
-  public void process() {
+  public void process() throws IOException {
+    String name = regionInfo.getEncodedName();
+    LOG.trace("Processing close of {}", name);
+    String encodedRegionName = regionInfo.getEncodedName();
+    // Check that this region is being served here
+    HRegion region = (HRegion)rsServices.getRegion(encodedRegionName);
     try {
-      String name = regionInfo.getEncodedName();
-      LOG.trace("Processing close of {}", name);
-      String encodedRegionName = regionInfo.getEncodedName();
-      // Check that this region is being served here
-      HRegion region = (HRegion)rsServices.getRegion(encodedRegionName);
       if (region == null) {
         LOG.warn("Received CLOSE for region {} but currently not serving - ignoring", name);
         // TODO: do better than a simple warning
@@ -106,20 +105,11 @@ public class CloseRegionHandler extends EventHandler {
       }
 
       // Close the region
-      try {
-        if (region.close(abort) == null) {
-          // This region got closed.  Most likely due to a split.
-          // The split message will clean up the master state.
-          LOG.warn("Can't close region {}, was already closed during close()", name);
-          return;
-        }
-      } catch (IOException ioe) {
-        // An IOException here indicates that we couldn't successfully flush the
-        // memstore before closing. So, we need to abort the server and allow
-        // the master to split our logs in order to recover the data.
-        server.abort("Unrecoverable exception while closing region " +
-          regionInfo.getRegionNameAsString() + ", still finishing close", ioe);
-        throw new RuntimeException(ioe);
+      if (region.close(abort) == null) {
+        // This region has already been closed. Should not happen (A unit test makes this
+        // happen as a side effect, TestRegionObserverInterface.testPreWALAppendNotCalledOnMetaEdit)
+        LOG.warn("Can't close {}; already closed", name);
+        return;
       }
 
       this.rsServices.removeRegion(region, destination);
@@ -127,10 +117,19 @@ public class CloseRegionHandler extends EventHandler {
         HConstants.NO_SEQNUM, Procedure.NO_PROC_ID, -1, regionInfo));
 
       // Done!  Region is closed on this RS
-      LOG.debug("Closed " + region.getRegionInfo().getRegionNameAsString());
-    } finally {
       this.rsServices.getRegionsInTransitionInRS().
         remove(this.regionInfo.getEncodedNameAsBytes(), Boolean.FALSE);
+      LOG.debug("Closed {}" + region.getRegionInfo().getRegionNameAsString());
+    } finally {
+      // Clear any reference in getServer().getRegionsInTransitionInRS() on success or failure,
+      // since a reference was added before this CRH was invoked. If we don't clear it, it can
+      // hold up regionserver abort on cluster shutdown. HBASE-23984.
+      this.rsServices.getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes());
     }
   }
+
+  @Override protected void handleException(Throwable t) {
+    server.abort("Unrecoverable exception while closing " +
+      this.regionInfo.getRegionNameAsString(), t);
+  }
 }
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java
index 0fc8edc..8b275d0 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java
@@ -33,7 +33,6 @@ import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
 
 /**
@@ -106,10 +105,10 @@ public class UnassignRegionHandler extends EventHandler {
     LOG.info("Close {}", regionName);
     if (region.getCoprocessorHost() != null) {
       // XXX: The behavior is a bit broken. At master side there is no FAILED_CLOSE state, so if
-      // there are exception thrown from the CP, we can not report the error to master, and if here
-      // we just return without calling reportRegionStateTransition, the TRSP at master side will
-      // hang there for ever. So here if the CP throws an exception out, the only way is to abort
-      // the RS...
+      // there are exception thrown from the CP, we can not report the error to master, and if
+      // here we just return without calling reportRegionStateTransition, the TRSP at master side
+      // will hang there for ever. So here if the CP throws an exception out, the only way is to
+      // abort the RS...
       region.getCoprocessorHost().preClose(abort);
     }
     if (region.close(abort) == null) {
@@ -120,8 +119,9 @@ public class UnassignRegionHandler extends EventHandler {
       return;
     }
     rs.removeRegion(region, destination);
-    if (!rs.reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.CLOSED,
-      HConstants.NO_SEQNUM, closeProcId, -1, region.getRegionInfo()))) {
+    if (!rs.reportRegionStateTransition(
+      new RegionStateTransitionContext(TransitionCode.CLOSED, HConstants.NO_SEQNUM, closeProcId,
+        -1, region.getRegionInfo()))) {
       throw new IOException("Failed to report close to master: " + regionName);
     }
     // Cache the close region procedure id after report region transition succeed.
@@ -133,6 +133,9 @@ public class UnassignRegionHandler extends EventHandler {
   @Override
   protected void handleException(Throwable t) {
     LOG.warn("Fatal error occurred while closing region {}, aborting...", encodedName, t);
+    // Clear any reference in getServer().getRegionsInTransitionInRS() otherwise can hold up
+    // regionserver abort on cluster shutdown. HBASE-23984.
+    getServer().getRegionsInTransitionInRS().remove(Bytes.toBytes(this.encodedName));
     getServer().abort("Failed to close region " + encodedName + " and can not recover", t);
   }