You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by jd...@apache.org on 2012/07/20 00:23:27 UTC

svn commit: r1363571 - in /hbase/branches/0.92: CHANGES.txt src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java

Author: jdcryans
Date: Thu Jul 19 22:23:26 2012
New Revision: 1363571

URL: http://svn.apache.org/viewvc?rev=1363571&view=rev
Log:
HBASE-6319  ReplicationSource can call terminate on itself and deadlock
HBASE-6325  [replication] Race in ReplicationSourceManager.init can initiate a failover even if the node is alive

Modified:
    hbase/branches/0.92/CHANGES.txt
    hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
    hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java

Modified: hbase/branches/0.92/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/CHANGES.txt?rev=1363571&r1=1363570&r2=1363571&view=diff
==============================================================================
--- hbase/branches/0.92/CHANGES.txt (original)
+++ hbase/branches/0.92/CHANGES.txt Thu Jul 19 22:23:26 2012
@@ -92,6 +92,9 @@ Release 0.92.2 - Unreleased
    HBASE-6357  Failed distributed log splitting stuck on master web UI
    HBASE-6382  Upgrade Jersey to 1.8 to match Hadoop 1 and 2
    HBASE-6392  UnknownRegionException blocks hbck from sideline big overlap regions
+   HBASE-6319  ReplicationSource can call terminate on itself and deadlock
+   HBASE-6325  [replication] Race in ReplicationSourceManager.init can initiate a failover even if the node is alive
+
 
   IMPROVEMENTS
    HBASE-5592  Make it easier to get a table from shell (Ben West)

Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java?rev=1363571&r1=1363570&r2=1363571&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java Thu Jul 19 22:23:26 2012
@@ -705,7 +705,10 @@ public class ReplicationSource extends T
           + " because an error occurred: " + reason, cause);
     }
     this.running = false;
-    Threads.shutdown(this, this.sleepForRetries);
+    // Only wait for the thread to die if it's not us
+    if (!Thread.currentThread().equals(this)) {
+      Threads.shutdown(this, this.sleepForRetries);
+    }
   }
 
   /**

Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java?rev=1363571&r1=1363570&r2=1363571&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java Thu Jul 19 22:23:26 2012
@@ -78,7 +78,7 @@ public class ReplicationSourceManager {
   // The path to the latest log we saw, for new coming sources
   private Path latestPath;
   // List of all the other region servers in this cluster
-  private final List<String> otherRegionServers;
+  private final List<String> otherRegionServers = new ArrayList<String>();
   // Path to the hlogs directories
   private final Path logDir;
   // Path to the hlog archive
@@ -119,12 +119,9 @@ public class ReplicationSourceManager {
     this.sleepBeforeFailover = conf.getLong("replication.sleep.before.failover", 2000);
     this.zkHelper.registerRegionServerListener(
         new OtherRegionServerWatcher(this.zkHelper.getZookeeperWatcher()));
-    List<String> otherRSs =
-        this.zkHelper.getRegisteredRegionServers();
     this.zkHelper.registerRegionServerListener(
         new PeersWatcher(this.zkHelper.getZookeeperWatcher()));
     this.zkHelper.listPeersIdsAndWatch();
-    this.otherRegionServers = otherRSs == null ? new ArrayList<String>() : otherRSs;
     // It's preferable to failover 1 RS at a time, but with good zk servers
     // more could be processed at the same time.
     int nbWorkers = conf.getInt("replication.executor.workers", 1);
@@ -179,6 +176,7 @@ public class ReplicationSourceManager {
       return;
     }
     synchronized (otherRegionServers) {
+      refreshOtherRegionServersList();
       LOG.info("Current list of replicators: " + currentReplicators
           + " other RSs: " + otherRegionServers);
     }
@@ -398,6 +396,27 @@ public class ReplicationSourceManager {
   }
 
   /**
+   * Reads the list of region servers from ZK and atomically clears our
+   * local view of it and replaces it with the updated list.
+   * 
+   * @return true if the local list of the other region servers was updated
+   * with the ZK data (even if it was empty),
+   * false if the data was missing in ZK
+   */
+  private boolean refreshOtherRegionServersList() {
+    List<String> newRsList = zkHelper.getRegisteredRegionServers();
+    if (newRsList == null) {
+      return false;
+    } else {
+      synchronized (otherRegionServers) {
+        otherRegionServers.clear();
+        otherRegionServers.addAll(newRsList);
+      }
+    }
+    return true;
+  }
+
+  /**
    * Watcher used to be notified of the other region server's death
    * in the local cluster. It initiates the process to transfer the queues
    * if it is able to grab the lock.
@@ -416,7 +435,7 @@ public class ReplicationSourceManager {
      * @param path full path of the new node
      */
     public void nodeCreated(String path) {
-      refreshRegionServersList(path);
+      refreshListIfRightPath(path);
     }
 
     /**
@@ -427,7 +446,7 @@ public class ReplicationSourceManager {
       if (stopper.isStopped()) {
         return;
       }
-      boolean cont = refreshRegionServersList(path);
+      boolean cont = refreshListIfRightPath(path);
       if (!cont) {
         return;
       }
@@ -443,23 +462,14 @@ public class ReplicationSourceManager {
       if (stopper.isStopped()) {
         return;
       }
-      refreshRegionServersList(path);
+      refreshListIfRightPath(path);
     }
 
-    private boolean refreshRegionServersList(String path) {
+    private boolean refreshListIfRightPath(String path) {
       if (!path.startsWith(zkHelper.getZookeeperWatcher().rsZNode)) {
         return false;
       }
-      List<String> newRsList = (zkHelper.getRegisteredRegionServers());
-      if (newRsList == null) {
-        return false;
-      } else {
-        synchronized (otherRegionServers) {
-          otherRegionServers.clear();
-          otherRegionServers.addAll(newRsList);
-        }
-      }
-      return true;
+      return refreshOtherRegionServersList();
     }
   }