You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by jd...@apache.org on 2011/03/15 22:47:48 UTC
svn commit: r1081957 - in /hbase/trunk: CHANGES.txt
src/main/java/org/apache/hadoop/hbase/replication/ReplicationZookeeper.java
src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java
Author: jdcryans
Date: Tue Mar 15 21:47:48 2011
New Revision: 1081957
URL: http://svn.apache.org/viewvc?rev=1081957&view=rev
Log:
HBASE-3648 [replication] failover is sloppy with znodes
HBASE-3596 [replication] Wait a few seconds before transferring queues
Modified:
hbase/trunk/CHANGES.txt
hbase/trunk/src/main/java/org/apache/hadoop/hbase/replication/ReplicationZookeeper.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java
Modified: hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=1081957&r1=1081956&r2=1081957&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Tue Mar 15 21:47:48 2011
@@ -162,6 +162,7 @@ Release 0.90.2 - Unreleased
HBASE-3636 a bug about deciding whether this key is a new key for the ROWCOL
bloomfilter (Liyin Tang via Stack)
HBASE-3639 FSUtils.getRootDir should qualify path
+ HBASE-3648 [replication] failover is sloppy with znodes
IMPROVEMENTS
HBASE-3542 MultiGet methods in Thrift
@@ -171,6 +172,7 @@ Release 0.90.2 - Unreleased
(Harsh J Chouraria via Stack)
HBASE-3625 improve/fix support excluding Tests via Maven -D property
(Alejandro Abdelnur via todd)
+ HBASE-3596 [replication] Wait a few seconds before transferring queues
Release 0.90.1 - Unreleased
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/replication/ReplicationZookeeper.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/replication/ReplicationZookeeper.java?rev=1081957&r1=1081956&r2=1081957&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/replication/ReplicationZookeeper.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/replication/ReplicationZookeeper.java Tue Mar 15 21:47:48 2011
@@ -516,7 +516,18 @@ public class ReplicationZookeeper {
String p = ZKUtil.joinZNode(parent, RS_LOCK_ZNODE);
ZKUtil.createAndWatch(this.zookeeper, p, Bytes.toBytes(rsServerNameZnode));
} catch (KeeperException e) {
- LOG.info("Failed lock other rs", e);
+ // This exception will pop up if the znode under which we're trying to
+ // create the lock is already deleted by another region server, meaning
+ // that the transfer already occurred.
+ // NoNode => transfer is done and znodes are already deleted
+ // NodeExists => lock znode already created by another RS
+ if (e instanceof KeeperException.NoNodeException ||
+ e instanceof KeeperException.NodeExistsException) {
+ LOG.info("Won't transfer the queue," +
+ " another RS took care of it because of: " + e.getMessage());
+ } else {
+ LOG.info("Failed lock other rs", e);
+ }
return false;
}
return true;
@@ -597,10 +608,30 @@ public class ReplicationZookeeper {
* @param znode
*/
public void deleteRsQueues(String znode) {
+ String fullpath = ZKUtil.joinZNode(rsZNode, znode);
try {
- ZKUtil.deleteNodeRecursively(this.zookeeper,
- ZKUtil.joinZNode(rsZNode, znode));
+ List<String> clusters =
+ ZKUtil.listChildrenNoWatch(this.zookeeper, fullpath);
+ for (String cluster : clusters) {
+ // We'll delete it later
+ if (cluster.equals(RS_LOCK_ZNODE)) {
+ continue;
+ }
+ String fullClusterPath = ZKUtil.joinZNode(fullpath, cluster);
+ ZKUtil.deleteNodeRecursively(this.zookeeper, fullClusterPath);
+ }
+ // Finish cleaning up
+ ZKUtil.deleteNodeRecursively(this.zookeeper, fullpath);
} catch (KeeperException e) {
+ if (e instanceof KeeperException.NoNodeException ||
+ e instanceof KeeperException.NotEmptyException) {
+ // Special case: another region server was able to create a lock just
+ // after we deleted ours, and then either deleted the RS znode before us
+ // (NoNode) or still has its lock znode under it (NotEmpty).
+ if (e.getPath().equals(fullpath)) {
+ return;
+ }
+ }
this.abortable.abort("Failed delete of " + znode, e);
}
}
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java?rev=1081957&r1=1081956&r2=1081957&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java Tue Mar 15 21:47:48 2011
@@ -76,6 +76,8 @@ public class ReplicationSourceManager {
private final Path logDir;
// Path to the hlog archive
private final Path oldLogDir;
+ // The number of ms that we wait before moving znodes, HBASE-3596
+ private final long sleepBeforeFailover;
/**
* Creates a replication manager and sets the watch on all the other
@@ -105,6 +107,7 @@ public class ReplicationSourceManager {
this.fs = fs;
this.logDir = logDir;
this.oldLogDir = oldLogDir;
+ this.sleepBeforeFailover = conf.getLong("replication.sleep.before.failover", 2000);
this.zkHelper.registerRegionServerListener(
new OtherRegionServerWatcher(this.zkHelper.getZookeeperWatcher()));
List<String> otherRSs =
@@ -291,6 +294,14 @@ public class ReplicationSourceManager {
* @param rsZnode
*/
public void transferQueues(String rsZnode) {
+ // Wait a bit before transferring the queues, since we may be shutting down.
+ // This sleep may not be enough in some cases.
+ try {
+ Thread.sleep(this.sleepBeforeFailover);
+ } catch (InterruptedException e) {
+ LOG.warn("Interrupted while waiting before transferring a queue.");
+ Thread.currentThread().interrupt();
+ }
// We try to lock that rs' queue directory
if (this.stopper.isStopped()) {
LOG.info("Not transferring queue since we are shutting down");