You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by zh...@apache.org on 2023/03/24 09:29:51 UTC

[hbase] branch branch-2 updated (dd3cdf13cf7 -> 3b24b3b9697)

This is an automated email from the ASF dual-hosted git repository.

zhangduo pushed a change to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git


    from dd3cdf13cf7 HBASE-27744 Update compression dependencies (#5137)
     new 62fc8c81a37 HBASE-26526 Introduce a timeout to shutdown of WAL (#3297)
     new 3b24b3b9697 HBASE-26866 Shutdown WAL may abort region server (#4254)

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../hbase/regionserver/wal/AbstractFSWAL.java      | 73 ++++++++++++++++++----
 1 file changed, 62 insertions(+), 11 deletions(-)


[hbase] 02/02: HBASE-26866 Shutdown WAL may abort region server (#4254)

Posted by zh...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

zhangduo pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git

commit 3b24b3b9697b8e5108af1a14c2a597904a141291
Author: Duo Zhang <zh...@apache.org>
AuthorDate: Wed Mar 23 14:53:58 2022 +0800

    HBASE-26866 Shutdown WAL may abort region server (#4254)
    
    Signed-off-by: Xiaolin Ha <ha...@apache.org>
    (cherry picked from commit b67c16a7636958970d37bfcd775fd55e8de98177)
---
 .../hbase/regionserver/wal/AbstractFSWAL.java      | 25 +++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java
index 7ff52d9adf8..d6f7138a975 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java
@@ -48,6 +48,8 @@ import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ThreadPoolExecutor;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicBoolean;
@@ -331,8 +333,12 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
 
   protected final AtomicBoolean rollRequested = new AtomicBoolean(false);
 
-  private final ExecutorService logArchiveOrShutdownExecutor = Executors.newSingleThreadExecutor(
-    new ThreadFactoryBuilder().setDaemon(true).setNameFormat("WAL-Archive-Or-Shutdown-%d").build());
+  // Run the task in the caller thread if the executor rejects it, so that a rejected execution
+  // exception cannot abort the region server. Usually this should not happen; it is a safeguard.
+  private final ExecutorService logArchiveExecutor =
+    new ThreadPoolExecutor(1, 1, 1L, TimeUnit.MINUTES, new LinkedBlockingQueue<Runnable>(),
+      new ThreadFactoryBuilder().setDaemon(true).setNameFormat("WAL-Archive-%d").build(),
+      new ThreadPoolExecutor.CallerRunsPolicy());
 
   private final int archiveRetries;
 
@@ -722,7 +728,7 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
       final List<Pair<Path, Long>> localLogsToArchive = logsToArchive;
       // make it async
       for (Pair<Path, Long> log : localLogsToArchive) {
-        logArchiveOrShutdownExecutor.execute(() -> {
+        logArchiveExecutor.execute(() -> {
           archive(log);
         });
         this.walFile2Props.remove(log.getFirst());
@@ -937,7 +943,10 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
       }
     }
 
-    Future<Void> future = logArchiveOrShutdownExecutor.submit(new Callable<Void>() {
+    ExecutorService shutdownExecutor = Executors.newSingleThreadExecutor(
+      new ThreadFactoryBuilder().setDaemon(true).setNameFormat("WAL-Shutdown-%d").build());
+
+    Future<Void> future = shutdownExecutor.submit(new Callable<Void>() {
       @Override
       public Void call() throws Exception {
         if (rollWriterLock.tryLock(walShutdownTimeout, TimeUnit.SECONDS)) {
@@ -955,7 +964,7 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
         return null;
       }
     });
-    logArchiveOrShutdownExecutor.shutdown();
+    shutdownExecutor.shutdown();
 
     try {
       future.get(walShutdownTimeout, TimeUnit.MILLISECONDS);
@@ -972,6 +981,12 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
       } else {
         throw new IOException(e.getCause());
       }
+    } finally {
+      // During shutdown we may call cleanOldLogs, so shut down this executor at the very end.
+      // In the sync replication implementation, we may shut down a WAL without shutting down the
+      // whole region server; if we shut down this executor earlier, we may get a rejected
+      // execution exception and abort the region server.
+      logArchiveExecutor.shutdown();
     }
   }
 


[hbase] 01/02: HBASE-26526 Introduce a timeout to shutdown of WAL (#3297)

Posted by zh...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

zhangduo pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git

commit 62fc8c81a3747c5e0407973770ef32ffbc31953e
Author: Xiaolin Ha <ha...@apache.org>
AuthorDate: Tue Dec 7 12:26:59 2021 +0800

    HBASE-26526 Introduce a timeout to shutdown of WAL (#3297)
    
    Signed-off-by: Andrew Purtell <ap...@apache.org>
    (cherry picked from commit ca3ba494cbc322b0824d2d755bcf4191c3a525ed)
---
 .../hbase/regionserver/wal/AbstractFSWAL.java      | 62 +++++++++++++++++-----
 1 file changed, 49 insertions(+), 13 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java
index 24e8b9b26a9..7ff52d9adf8 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java
@@ -39,6 +39,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.OptionalLong;
 import java.util.Set;
+import java.util.concurrent.Callable;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentNavigableMap;
 import java.util.concurrent.ConcurrentSkipListMap;
@@ -46,7 +47,9 @@ import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
@@ -144,6 +147,9 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
   public static final String RING_BUFFER_SLOT_COUNT =
     "hbase.regionserver.wal.disruptor.event.count";
 
+  public static final String WAL_SHUTDOWN_WAIT_TIMEOUT_MS = "hbase.wal.shutdown.wait.timeout.ms";
+  public static final int DEFAULT_WAL_SHUTDOWN_WAIT_TIMEOUT_MS = 15 * 1000;
+
   /**
    * file system instance
    */
@@ -271,6 +277,9 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
   protected volatile boolean closed = false;
 
   protected final AtomicBoolean shutdown = new AtomicBoolean(false);
+
+  protected final long walShutdownTimeout;
+
   /**
    * WAL Comparator; it compares the timestamp (log filenum), present in the log file name. Throws
    * an IllegalArgumentException if used to compare paths from different wals.
@@ -322,8 +331,8 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
 
   protected final AtomicBoolean rollRequested = new AtomicBoolean(false);
 
-  private final ExecutorService logArchiveExecutor = Executors.newSingleThreadExecutor(
-    new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Log-Archiver-%d").build());
+  private final ExecutorService logArchiveOrShutdownExecutor = Executors.newSingleThreadExecutor(
+    new ThreadFactoryBuilder().setDaemon(true).setNameFormat("WAL-Archive-Or-Shutdown-%d").build());
 
   private final int archiveRetries;
 
@@ -480,7 +489,9 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
     this.syncFutureCache = new SyncFutureCache(conf);
     this.implClassName = getClass().getSimpleName();
     this.useHsync = conf.getBoolean(HRegion.WAL_HSYNC_CONF_KEY, HRegion.DEFAULT_WAL_HSYNC);
-    archiveRetries = this.conf.getInt("hbase.regionserver.logroll.archive.retries", 0);
+    archiveRetries = this.conf.getInt("hbase.regionserver.walroll.archive.retries", 0);
+    this.walShutdownTimeout =
+      conf.getLong(WAL_SHUTDOWN_WAIT_TIMEOUT_MS, DEFAULT_WAL_SHUTDOWN_WAIT_TIMEOUT_MS);
   }
 
   /**
@@ -711,7 +722,7 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
       final List<Pair<Path, Long>> localLogsToArchive = logsToArchive;
       // make it async
       for (Pair<Path, Long> log : localLogsToArchive) {
-        logArchiveExecutor.execute(() -> {
+        logArchiveOrShutdownExecutor.execute(() -> {
           archive(log);
         });
         this.walFile2Props.remove(log.getFirst());
@@ -925,17 +936,42 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
         i.logCloseRequested();
       }
     }
-    rollWriterLock.lock();
-    try {
-      doShutdown();
-      if (syncFutureCache != null) {
-        syncFutureCache.clear();
+
+    Future<Void> future = logArchiveOrShutdownExecutor.submit(new Callable<Void>() {
+      @Override
+      public Void call() throws Exception {
+        if (rollWriterLock.tryLock(walShutdownTimeout, TimeUnit.SECONDS)) {
+          try {
+            doShutdown();
+            if (syncFutureCache != null) {
+              syncFutureCache.clear();
+            }
+          } finally {
+            rollWriterLock.unlock();
+          }
+        } else {
+          throw new IOException("Waiting for rollWriterLock timeout");
+        }
+        return null;
       }
-      if (logArchiveExecutor != null) {
-        logArchiveExecutor.shutdownNow();
+    });
+    logArchiveOrShutdownExecutor.shutdown();
+
+    try {
+      future.get(walShutdownTimeout, TimeUnit.MILLISECONDS);
+    } catch (InterruptedException e) {
+      throw new InterruptedIOException("Interrupted when waiting for shutdown WAL");
+    } catch (TimeoutException e) {
+      throw new TimeoutIOException("We have waited " + walShutdownTimeout + "ms, but"
+        + " the shutdown of WAL doesn't complete! Please check the status of underlying "
+        + "filesystem or increase the wait time by the config \"" + WAL_SHUTDOWN_WAIT_TIMEOUT_MS
+        + "\"", e);
+    } catch (ExecutionException e) {
+      if (e.getCause() instanceof IOException) {
+        throw (IOException) e.getCause();
+      } else {
+        throw new IOException(e.getCause());
       }
-    } finally {
-      rollWriterLock.unlock();
     }
   }