You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ap...@apache.org on 2019/11/16 02:03:30 UTC
[hbase] branch branch-1.4 updated: HBASE-23261 Region stuck in transition while splitting
This is an automated email from the ASF dual-hosted git repository.
apurtell pushed a commit to branch branch-1.4
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-1.4 by this push:
new 10df080 HBASE-23261 Region stuck in transition while splitting
10df080 is described below
commit 10df080a1975bf96ef0c59a7598bd6e7269bbaac
Author: Viraj Jasani <vi...@gmail.com>
AuthorDate: Tue Nov 12 01:04:12 2019 +0530
HBASE-23261 Region stuck in transition while splitting
Processing ZK BadVersionException during node transition
Signed-off-by: Andrew Purtell <ap...@apache.org>
---
.../org/apache/hadoop/hbase/zookeeper/ZKAssign.java | 10 +++++++++-
.../coordination/ZKSplitTransactionCoordination.java | 20 ++++++++++++++++++--
2 files changed, 27 insertions(+), 3 deletions(-)
diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKAssign.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKAssign.java
index 297e96e..b2e1e1e 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKAssign.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKAssign.java
@@ -868,7 +868,15 @@ public class ZKAssign {
try {
rt = RegionTransition.createRegionTransition(
endState, region.getRegionName(), serverName, payload);
- if(!ZKUtil.setData(zkw, node, rt.toByteArray(), stat.getVersion())) {
+ boolean isDataSet;
+ try {
+ isDataSet = ZKUtil.setData(zkw, node, rt.toByteArray(), stat.getVersion());
+ } catch (KeeperException.BadVersionException e) {
+ isDataSet = false;
+ LOG.error("Received BadVersionException from ZK for " + encoded
+ + ", version: " + stat.getVersion());
+ }
+ if (!isDataSet) {
LOG.warn(zkw.prefix("Attempt to transition the " +
"unassigned node for " + encoded +
" from " + beginState + " to " + endState + " failed, " +
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZKSplitTransactionCoordination.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZKSplitTransactionCoordination.java
index f6e96fa..24164e5 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZKSplitTransactionCoordination.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZKSplitTransactionCoordination.java
@@ -40,6 +40,13 @@ public class ZKSplitTransactionCoordination implements SplitTransactionCoordinat
private CoordinatedStateManager coordinationManager;
private final ZooKeeperWatcher watcher;
+ // Max wait for split transaction: 100 loop iterations with a 100 ms thread sleep each time.
+ // This accounts for ~24 s due to the calls involved in the loop. Even for a busy cluster, by
+ // this time we should have been able to complete setData(). In fact, ideally, the 2nd retry
+ // after a failed attempt should be sufficient to retrieve the correct ZK node version and
+ // successfully update the RIT info in the ZK node.
+ private static final int SPIN_WAIT_TIMEOUT = 100;
+
private static final Log LOG = LogFactory.getLog(ZKSplitTransactionCoordination.class);
public ZKSplitTransactionCoordination(CoordinatedStateManager coordinationProvider,
@@ -163,6 +170,10 @@ public class ZKSplitTransactionCoordination implements SplitTransactionCoordinat
}
Thread.sleep(100);
spins++;
+ if (spins > SPIN_WAIT_TIMEOUT) {
+ throw new IOException("Waiting time for Split Transaction exceeded for region: "
+ + parent.getRegionInfo().getRegionNameAsString());
+ }
byte[] data = ZKAssign.getDataNoWatch(watcher, node, stat);
if (data == null) {
throw new IOException("Data is null, splitting node " + node + " no longer exists");
@@ -222,9 +233,14 @@ public class ZKSplitTransactionCoordination implements SplitTransactionCoordinat
// Tell master about split by updating zk. If we fail, abort.
if (coordinationManager.getServer() != null) {
try {
- zstd.setZnodeVersion(transitionSplittingNode(parent.getRegionInfo(), a.getRegionInfo(),
+ int newNodeVersion = transitionSplittingNode(parent.getRegionInfo(), a.getRegionInfo(),
b.getRegionInfo(), coordinationManager.getServer().getServerName(), zstd,
- RS_ZK_REGION_SPLITTING, RS_ZK_REGION_SPLIT));
+ RS_ZK_REGION_SPLITTING, RS_ZK_REGION_SPLIT);
+ if (newNodeVersion == -1) {
+ throw new IOException("Notifying master of RS split failed for region: "
+ + parent.getRegionInfo().getRegionNameAsString());
+ }
+ zstd.setZnodeVersion(newNodeVersion);
int spins = 0;
// Now wait for the master to process the split. We know it's done