You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2019/05/20 16:24:26 UTC

[hbase] branch branch-2.0 updated: HBASE-22289 WAL-based log splitting resubmit threshold may result in a task being stuck forever

This is an automated email from the ASF dual-hosted git repository.

stack pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 7f61881  HBASE-22289 WAL-based log splitting resubmit threshold may result in a task being stuck forever
7f61881 is described below

commit 7f61881197b6d27240e0cfd6f68c8ba0c8ef2657
Author: Sergey Soldatov <ss...@apache.org>
AuthorDate: Fri May 17 19:31:34 2019 -0700

    HBASE-22289 WAL-based log splitting resubmit threshold may result in a task being stuck forever
---
 .../coordination/ZkSplitLogWorkerCoordination.java |  4 +-
 .../hadoop/hbase/regionserver/SplitLogWorker.java  |  2 +
 .../regionserver/handler/WALSplitterHandler.java   | 45 +++++++++++++---------
 3 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZkSplitLogWorkerCoordination.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZkSplitLogWorkerCoordination.java
index 2143f80..fe5dc8e 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZkSplitLogWorkerCoordination.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZkSplitLogWorkerCoordination.java
@@ -572,7 +572,9 @@ public class ZkSplitLogWorkerCoordination extends ZKListener implements
     try {
       if (ZKUtil.setData(watcher, task, slt.toByteArray(), taskZKVersion)) {
         LOG.info("successfully transitioned task " + task + " to final state " + slt);
-        ctr.increment();
+        if (ctr != null) {
+          ctr.increment();
+        }
         return;
       }
       LOG.warn("failed to transistion task " + task + " to end state " + slt
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java
index a1c2030..d4549de 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java
@@ -96,6 +96,8 @@ public class SplitLogWorker implements Runnable {
         // TODO have to correctly figure out when log splitting has been
         // interrupted or has encountered a transient error and when it has
         // encountered a bad non-retry-able persistent error.
+        // Note: this can actually get the master stuck (HBASE-22289) so treat preempted as error.
+        //       splitLogFile does return false for legitimate retriable errors.
         try {
           if (!WALSplitter.splitLogFile(walDir, fs.getFileStatus(new Path(walDir, filename)),
             fs, conf, p, sequenceIdChecker,
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/WALSplitterHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/WALSplitterHandler.java
index 49ab574..f1bbfe7 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/WALSplitterHandler.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/WALSplitterHandler.java
@@ -62,38 +62,47 @@ public class WALSplitterHandler extends EventHandler {
     this.splitTaskExecutor = splitTaskExecutor;
   }
 
+
+  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="SF_SWITCH_FALLTHROUGH",
+      justification="Intentional")
   @Override
   public void process() throws IOException {
     long startTime = System.currentTimeMillis();
     Status status = null;
     try {
       status = this.splitTaskExecutor.exec(splitTaskDetails.getWALFile(), reporter);
+      boolean wasCounterIncremented = false;
       switch (status) {
       case DONE:
         coordination.endTask(new SplitLogTask.Done(this.serverName),
           SplitLogCounters.tot_wkr_task_done, splitTaskDetails);
         break;
       case PREEMPTED:
-        SplitLogCounters.tot_wkr_preempt_task.increment();
-        LOG.warn("task execution preempted " + splitTaskDetails.getWALFile());
-        break;
+          SplitLogCounters.tot_wkr_preempt_task.increment();
+          wasCounterIncremented = true;
+          // Preempted state can currently be returned either when task is preempted, or when
+          // there's a particular kind of error (e.g. some ZK/HDFS errors, in my observation).
+          // In the latter case, master-side split task will get stuck if we don't update the
+          // status. Treat preemption as error to be on the safe side.
+          LOG.warn("task execution preempted; treating as error " + splitTaskDetails.getWALFile());
+          //$FALL-THROUGH$
       case ERR:
-        if (server != null && !server.isStopped()) {
-          coordination.endTask(new SplitLogTask.Err(this.serverName),
-            SplitLogCounters.tot_wkr_task_err, splitTaskDetails);
-          break;
-        }
-        // if the RS is exiting then there is probably a tons of stuff
-        // that can go wrong. Resign instead of signaling error.
-        //$FALL-THROUGH$
+          if (server != null && !server.isStopped()) {
+            coordination.endTask(new SplitLogTask.Err(this.serverName), wasCounterIncremented
+              ? null : SplitLogCounters.tot_wkr_task_err, splitTaskDetails);
+            break;
+          }
+          // if the RS is exiting then there is probably a tons of stuff
+          // that can go wrong. Resign instead of signaling error.
+          //$FALL-THROUGH$
       case RESIGNED:
-        if (server != null && server.isStopped()) {
-          LOG.info("task execution interrupted because worker is exiting "
-              + splitTaskDetails.toString());
-        }
-        coordination.endTask(new SplitLogTask.Resigned(this.serverName),
-          SplitLogCounters.tot_wkr_task_resigned, splitTaskDetails);
-        break;
+          if (server != null && server.isStopped()) {
+            LOG.info("task execution interrupted because worker is exiting "
+                + splitTaskDetails.toString());
+          }
+          coordination.endTask(new SplitLogTask.Resigned(this.serverName), wasCounterIncremented
+            ? null : SplitLogCounters.tot_wkr_task_resigned, splitTaskDetails);
+          break;
       }
     } finally {
       LOG.info("Worker " + serverName + " done with task " + splitTaskDetails.toString() + " in "