You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ap...@apache.org on 2019/11/16 02:03:30 UTC

[hbase] branch branch-1.4 updated: HBASE-23261 Region stuck in transition while splitting

This is an automated email from the ASF dual-hosted git repository.

apurtell pushed a commit to branch branch-1.4
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-1.4 by this push:
     new 10df080  HBASE-23261 Region stuck in transition while splitting
10df080 is described below

commit 10df080a1975bf96ef0c59a7598bd6e7269bbaac
Author: Viraj Jasani <vi...@gmail.com>
AuthorDate: Tue Nov 12 01:04:12 2019 +0530

    HBASE-23261 Region stuck in transition while splitting
    
    Processing ZK BadVersionException during node transition
    
    Signed-off-by: Andrew Purtell <ap...@apache.org>
---
 .../org/apache/hadoop/hbase/zookeeper/ZKAssign.java  | 10 +++++++++-
 .../coordination/ZKSplitTransactionCoordination.java | 20 ++++++++++++++++++--
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKAssign.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKAssign.java
index 297e96e..b2e1e1e 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKAssign.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKAssign.java
@@ -868,7 +868,15 @@ public class ZKAssign {
     try {
       rt = RegionTransition.createRegionTransition(
           endState, region.getRegionName(), serverName, payload);
-      if(!ZKUtil.setData(zkw, node, rt.toByteArray(), stat.getVersion())) {
+      boolean isDataSet;
+      try {
+        isDataSet = ZKUtil.setData(zkw, node, rt.toByteArray(), stat.getVersion());
+      } catch (KeeperException.BadVersionException e) {
+        isDataSet = false;
+        LOG.error("Received BadVersionException from ZK for " + encoded
+          + ", version: " + stat.getVersion());
+      }
+      if (!isDataSet) {
         LOG.warn(zkw.prefix("Attempt to transition the " +
         "unassigned node for " + encoded +
         " from " + beginState + " to " + endState + " failed, " +
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZKSplitTransactionCoordination.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZKSplitTransactionCoordination.java
index f6e96fa..24164e5 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZKSplitTransactionCoordination.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZKSplitTransactionCoordination.java
@@ -40,6 +40,13 @@ public class ZKSplitTransactionCoordination implements SplitTransactionCoordinat
   private CoordinatedStateManager coordinationManager;
   private final ZooKeeperWatcher watcher;
 
+  // Max wait for split transaction: 100 times in a loop with 100 ms of thread sleep each time.
+  // This accounts for ~24 s due to the calls involved in the loop. Even for a busy cluster, by
+  // this time we should have been able to complete setData(). In fact, ideally, the 2nd retry
+  // after a failed attempt should be sufficient to retrieve the correct ZK node version and
+  // successfully update the RIT info in the ZK node.
+  private static final int SPIN_WAIT_TIMEOUT = 100;
+
   private static final Log LOG = LogFactory.getLog(ZKSplitTransactionCoordination.class);
 
   public ZKSplitTransactionCoordination(CoordinatedStateManager coordinationProvider,
@@ -163,6 +170,10 @@ public class ZKSplitTransactionCoordination implements SplitTransactionCoordinat
         }
         Thread.sleep(100);
         spins++;
+        if (spins > SPIN_WAIT_TIMEOUT) {
+          throw new IOException("Waiting time for Split Transaction exceeded for region: "
+            + parent.getRegionInfo().getRegionNameAsString());
+        }
         byte[] data = ZKAssign.getDataNoWatch(watcher, node, stat);
         if (data == null) {
           throw new IOException("Data is null, splitting node " + node + " no longer exists");
@@ -222,9 +233,14 @@ public class ZKSplitTransactionCoordination implements SplitTransactionCoordinat
     // Tell master about split by updating zk. If we fail, abort.
     if (coordinationManager.getServer() != null) {
       try {
-        zstd.setZnodeVersion(transitionSplittingNode(parent.getRegionInfo(), a.getRegionInfo(),
+        int newNodeVersion = transitionSplittingNode(parent.getRegionInfo(), a.getRegionInfo(),
           b.getRegionInfo(), coordinationManager.getServer().getServerName(), zstd,
-          RS_ZK_REGION_SPLITTING, RS_ZK_REGION_SPLIT));
+          RS_ZK_REGION_SPLITTING, RS_ZK_REGION_SPLIT);
+        if (newNodeVersion == -1) {
+          throw new IOException("Notifying master of RS split failed for region: "
+            + parent.getRegionInfo().getRegionNameAsString());
+        }
+        zstd.setZnodeVersion(newNodeVersion);
 
         int spins = 0;
         // Now wait for the master to process the split. We know it's done