You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by zh...@apache.org on 2023/03/24 09:29:42 UTC

[hbase] branch branch-2.5 updated (e1b58290d1d -> 32f95c4b13f)

This is an automated email from the ASF dual-hosted git repository.

zhangduo pushed a change to branch branch-2.5
in repository https://gitbox.apache.org/repos/asf/hbase.git


    from e1b58290d1d HBASE-27744 Update compression dependencies (#5137)
     new 17deed95334 HBASE-26526 Introduce a timeout to shutdown of WAL (#3297)
     new 32f95c4b13f HBASE-26866 Shutdown WAL may abort region server (#4254)

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../hbase/regionserver/wal/AbstractFSWAL.java      | 73 ++++++++++++++++++----
 1 file changed, 62 insertions(+), 11 deletions(-)


[hbase] 01/02: HBASE-26526 Introduce a timeout to shutdown of WAL (#3297)

Posted by zh...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

zhangduo pushed a commit to branch branch-2.5
in repository https://gitbox.apache.org/repos/asf/hbase.git

commit 17deed95334ba900fe5f2840225d7c48af18633c
Author: Xiaolin Ha <ha...@apache.org>
AuthorDate: Tue Dec 7 12:26:59 2021 +0800

    HBASE-26526 Introduce a timeout to shutdown of WAL (#3297)
    
    Signed-off-by: Andrew Purtell <ap...@apache.org>
    (cherry picked from commit ca3ba494cbc322b0824d2d755bcf4191c3a525ed)
---
 .../hbase/regionserver/wal/AbstractFSWAL.java      | 62 +++++++++++++++++-----
 1 file changed, 49 insertions(+), 13 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java
index 3d7678c37c6..4a788fc075a 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java
@@ -39,6 +39,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.OptionalLong;
 import java.util.Set;
+import java.util.concurrent.Callable;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentNavigableMap;
 import java.util.concurrent.ConcurrentSkipListMap;
@@ -46,7 +47,9 @@ import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
@@ -143,6 +146,9 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
   public static final String RING_BUFFER_SLOT_COUNT =
     "hbase.regionserver.wal.disruptor.event.count";
 
+  public static final String WAL_SHUTDOWN_WAIT_TIMEOUT_MS = "hbase.wal.shutdown.wait.timeout.ms";
+  public static final int DEFAULT_WAL_SHUTDOWN_WAIT_TIMEOUT_MS = 15 * 1000;
+
   /**
    * file system instance
    */
@@ -270,6 +276,9 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
   protected volatile boolean closed = false;
 
   protected final AtomicBoolean shutdown = new AtomicBoolean(false);
+
+  protected final long walShutdownTimeout;
+
   /**
    * WAL Comparator; it compares the timestamp (log filenum), present in the log file name. Throws
    * an IllegalArgumentException if used to compare paths from different wals.
@@ -321,8 +330,8 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
 
   protected final AtomicBoolean rollRequested = new AtomicBoolean(false);
 
-  private final ExecutorService logArchiveExecutor = Executors.newSingleThreadExecutor(
-    new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Log-Archiver-%d").build());
+  private final ExecutorService logArchiveOrShutdownExecutor = Executors.newSingleThreadExecutor(
+    new ThreadFactoryBuilder().setDaemon(true).setNameFormat("WAL-Archive-Or-Shutdown-%d").build());
 
   private final int archiveRetries;
 
@@ -479,7 +488,9 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
     this.syncFutureCache = new SyncFutureCache(conf);
     this.implClassName = getClass().getSimpleName();
     this.useHsync = conf.getBoolean(HRegion.WAL_HSYNC_CONF_KEY, HRegion.DEFAULT_WAL_HSYNC);
-    archiveRetries = this.conf.getInt("hbase.regionserver.logroll.archive.retries", 0);
+    archiveRetries = this.conf.getInt("hbase.regionserver.walroll.archive.retries", 0);
+    this.walShutdownTimeout =
+      conf.getLong(WAL_SHUTDOWN_WAIT_TIMEOUT_MS, DEFAULT_WAL_SHUTDOWN_WAIT_TIMEOUT_MS);
   }
 
   /**
@@ -710,7 +721,7 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
       final List<Pair<Path, Long>> localLogsToArchive = logsToArchive;
       // make it async
       for (Pair<Path, Long> log : localLogsToArchive) {
-        logArchiveExecutor.execute(() -> {
+        logArchiveOrShutdownExecutor.execute(() -> {
           archive(log);
         });
         this.walFile2Props.remove(log.getFirst());
@@ -924,17 +935,42 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
         i.logCloseRequested();
       }
     }
-    rollWriterLock.lock();
-    try {
-      doShutdown();
-      if (syncFutureCache != null) {
-        syncFutureCache.clear();
+
+    Future<Void> future = logArchiveOrShutdownExecutor.submit(new Callable<Void>() {
+      @Override
+      public Void call() throws Exception {
+        if (rollWriterLock.tryLock(walShutdownTimeout, TimeUnit.SECONDS)) {
+          try {
+            doShutdown();
+            if (syncFutureCache != null) {
+              syncFutureCache.clear();
+            }
+          } finally {
+            rollWriterLock.unlock();
+          }
+        } else {
+          throw new IOException("Waiting for rollWriterLock timeout");
+        }
+        return null;
       }
-      if (logArchiveExecutor != null) {
-        logArchiveExecutor.shutdownNow();
+    });
+    logArchiveOrShutdownExecutor.shutdown();
+
+    try {
+      future.get(walShutdownTimeout, TimeUnit.MILLISECONDS);
+    } catch (InterruptedException e) {
+      throw new InterruptedIOException("Interrupted when waiting for shutdown WAL");
+    } catch (TimeoutException e) {
+      throw new TimeoutIOException("We have waited " + walShutdownTimeout + "ms, but"
+        + " the shutdown of WAL doesn't complete! Please check the status of underlying "
+        + "filesystem or increase the wait time by the config \"" + WAL_SHUTDOWN_WAIT_TIMEOUT_MS
+        + "\"", e);
+    } catch (ExecutionException e) {
+      if (e.getCause() instanceof IOException) {
+        throw (IOException) e.getCause();
+      } else {
+        throw new IOException(e.getCause());
       }
-    } finally {
-      rollWriterLock.unlock();
     }
   }
 


[hbase] 02/02: HBASE-26866 Shutdown WAL may abort region server (#4254)

Posted by zh...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

zhangduo pushed a commit to branch branch-2.5
in repository https://gitbox.apache.org/repos/asf/hbase.git

commit 32f95c4b13fa23d9be4cdd4056bb39fa5372cec6
Author: Duo Zhang <zh...@apache.org>
AuthorDate: Wed Mar 23 14:53:58 2022 +0800

    HBASE-26866 Shutdown WAL may abort region server (#4254)
    
    Signed-off-by: Xiaolin Ha <ha...@apache.org>
    (cherry picked from commit b67c16a7636958970d37bfcd775fd55e8de98177)
---
 .../hbase/regionserver/wal/AbstractFSWAL.java      | 25 +++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java
index 4a788fc075a..450b8f33e8f 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java
@@ -48,6 +48,8 @@ import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ThreadPoolExecutor;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicBoolean;
@@ -330,8 +332,12 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
 
   protected final AtomicBoolean rollRequested = new AtomicBoolean(false);
 
-  private final ExecutorService logArchiveOrShutdownExecutor = Executors.newSingleThreadExecutor(
-    new ThreadFactoryBuilder().setDaemon(true).setNameFormat("WAL-Archive-Or-Shutdown-%d").build());
+  // Use CallerRunsPolicy so that a rejected task never throws RejectedExecutionException, which
+  // would abort the region server. Rejection should be rare, but this makes the WAL more robust.
+  private final ExecutorService logArchiveExecutor =
+    new ThreadPoolExecutor(1, 1, 1L, TimeUnit.MINUTES, new LinkedBlockingQueue<Runnable>(),
+      new ThreadFactoryBuilder().setDaemon(true).setNameFormat("WAL-Archive-%d").build(),
+      new ThreadPoolExecutor.CallerRunsPolicy());
 
   private final int archiveRetries;
 
@@ -721,7 +727,7 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
       final List<Pair<Path, Long>> localLogsToArchive = logsToArchive;
       // make it async
       for (Pair<Path, Long> log : localLogsToArchive) {
-        logArchiveOrShutdownExecutor.execute(() -> {
+        logArchiveExecutor.execute(() -> {
           archive(log);
         });
         this.walFile2Props.remove(log.getFirst());
@@ -936,7 +942,10 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
       }
     }
 
-    Future<Void> future = logArchiveOrShutdownExecutor.submit(new Callable<Void>() {
+    ExecutorService shutdownExecutor = Executors.newSingleThreadExecutor(
+      new ThreadFactoryBuilder().setDaemon(true).setNameFormat("WAL-Shutdown-%d").build());
+
+    Future<Void> future = shutdownExecutor.submit(new Callable<Void>() {
       @Override
       public Void call() throws Exception {
         if (rollWriterLock.tryLock(walShutdownTimeout, TimeUnit.SECONDS)) {
@@ -954,7 +963,7 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
         return null;
       }
     });
-    logArchiveOrShutdownExecutor.shutdown();
+    shutdownExecutor.shutdown();
 
     try {
       future.get(walShutdownTimeout, TimeUnit.MILLISECONDS);
@@ -971,6 +980,12 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
       } else {
         throw new IOException(e.getCause());
       }
+    } finally {
+      // Shutdown may call cleanOldLogs, so shut down this executor last.
+      // With sync replication we may shut down a WAL without shutting down the whole region
+      // server; shutting this executor down earlier could cause a rejected execution exception
+      // that aborts the region server.
+      logArchiveExecutor.shutdown();
     }
   }