You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by te...@apache.org on 2012/01/14 11:55:04 UTC

svn commit: r1231479 - in /hbase/branches/0.90: ./ src/main/java/org/apache/hadoop/hbase/master/

Author: tedyu
Date: Sat Jan 14 10:55:03 2012
New Revision: 1231479

URL: http://svn.apache.org/viewvc?rev=1231479&view=rev
Log:
HBASE-5179  Concurrent processing of processFaileOver and ServerShutdownHandler may cause region to
               be assigned before log splitting is completed, causing data loss (Chunhui)

Modified:
    hbase/branches/0.90/CHANGES.txt
    hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
    hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java
    hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
    hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java

Modified: hbase/branches/0.90/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/CHANGES.txt?rev=1231479&r1=1231478&r2=1231479&view=diff
==============================================================================
--- hbase/branches/0.90/CHANGES.txt (original)
+++ hbase/branches/0.90/CHANGES.txt Sat Jan 14 10:55:03 2012
@@ -24,6 +24,8 @@ Release 0.90.6 - Unreleased
    HBASE-5160  Backport HBASE-4397 - -ROOT-, .META. tables stay offline for too long in 
                recovery phase after all RSs are shutdown at the same time (Ram)
    HBASE-5160  Addendum adding the removed method in AssignmentManager (Ram)
+   HBASE-5179  Concurrent processing of processFaileOver and ServerShutdownHandler may cause region to
+               be assigned before log splitting is completed, causing data loss (Chunhui)
 
   IMPROVEMENT
    HBASE-5102  Change the default value of the property "hbase.connection.per.config" to false in

Modified: hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1231479&r1=1231478&r2=1231479&view=diff
==============================================================================
--- hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Sat Jan 14 10:55:03 2012
@@ -193,11 +193,13 @@ public class AssignmentManager extends Z
   /**
    * Handle failover.  Restore state from META and ZK.  Handle any regions in
    * transition.  Presumes <code>.META.</code> and <code>-ROOT-</code> deployed.
+   * @param onlineServers onlined servers when master start
    * @throws KeeperException
    * @throws IOException
    * @throws InterruptedException 
    */
-  void processFailover() throws KeeperException, IOException, InterruptedException {
+  void processFailover(final Set<String> onlineServers)
+  throws KeeperException, IOException, InterruptedException {
     // Concurrency note: In the below the accesses on regionsInTransition are
     // outside of a synchronization block where usually all accesses to RIT are
     // synchronized.  The presumption is that in this case it is safe since this
@@ -218,7 +220,7 @@ public class AssignmentManager extends Z
     // Scan META to build list of existing regions, servers, and assignment
     // Returns servers who have not checked in (assumed dead) and their regions
     Map<String, List<Pair<HRegionInfo, Result>>> deadServers =
-      rebuildUserRegions();
+      rebuildUserRegions(onlineServers);
     // Process list of dead servers; note this will add regions to the RIT.
     // processRegionsInTransition will read them and assign them out.
     processDeadServers(deadServers);
@@ -1561,12 +1563,15 @@ public class AssignmentManager extends Z
    * <p>
    * Returns a map of servers that are not found to be online and the regions
    * they were hosting.
+   * @param onlineServers if one region's location belongs to onlineServers, it
+   *          doesn't need to be assigned
    * @return map of servers not online to their assigned regions, as stored
    *         in META
    * @throws IOException
  * @throws KeeperException 
    */
-  private Map<String, List<Pair<HRegionInfo,Result>>> rebuildUserRegions()
+  private Map<String, List<Pair<HRegionInfo,Result>>> rebuildUserRegions(
+      final Set<String> onlineServers)
   throws IOException, KeeperException {
     // Region assignment from META
     List<Result> results = MetaReader.fullScanOfResults(catalogTracker);
@@ -1593,7 +1598,7 @@ public class AssignmentManager extends Z
         if (checkIfRegionBelongsToDisabling(regionInfo)) {
           disablingTables.add(disablingTableName);
         }
-      } else if (!serverManager.isServerOnline(regionLocation.getServerName())) {
+      } else if (!onlineServers.contains(regionLocation.getServerName())) {
         // Region is located on a server that isn't online
         List<Pair<HRegionInfo,Result>> offlineRegions =
           offlineServers.get(regionLocation.getServerName());

Modified: hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java?rev=1231479&r1=1231478&r2=1231479&view=diff
==============================================================================
--- hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java (original)
+++ hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java Sat Jan 14 10:55:03 2012
@@ -22,15 +22,14 @@ package org.apache.hadoop.hbase.master;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
 import java.util.Set;
 
 import org.apache.commons.lang.NotImplementedException;
 import org.apache.hadoop.hbase.HServerInfo;
 
 /**
- * Class to hold dead servers list and utility querying dead server list.
+ * Class to hold dead servers list, utility querying dead server list and the
+ * dead servers being processed by the ServerShutdownHandler.
  */
 public class DeadServer implements Set<String> {
   /**
@@ -41,7 +40,11 @@ public class DeadServer implements Set<S
    * because by then, its regions have probably been reassigned.
    */
   private final Set<String> deadServers = new HashSet<String>();
-
+  /**
+   * Set of dead servers under processing by the ServerShutdownHander.
+   */
+  private final Set<String> deadServersUnderProcessing = new HashSet<String>();
+  
   /** Maximum number of dead servers to keep track of */
   private final int maxDeadServers;
 
@@ -111,13 +114,22 @@ public class DeadServer implements Set<S
     return clone;
   }
 
+  synchronized Set<String> getDeadServersBeingProcessed() {
+    Set<String> clone = new HashSet<String>(
+        this.deadServersUnderProcessing.size());
+    clone.addAll(this.deadServersUnderProcessing);
+    return clone;
+  }
+  
   public synchronized boolean add(String e) {
     this.numProcessing++;
+    deadServersUnderProcessing.add(e);
     return deadServers.add(e);
   }
 
   public synchronized void finish(String e) {
     this.numProcessing--;
+    deadServersUnderProcessing.remove(e);
   }
 
   public synchronized int size() {
@@ -179,4 +191,4 @@ public class DeadServer implements Set<S
   public synchronized String toString() {
     return this.deadServers.toString();
   }
-}
\ No newline at end of file
+}

Modified: hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1231479&r1=1231478&r2=1231479&view=diff
==============================================================================
--- hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Sat Jan 14 10:55:03 2012
@@ -25,8 +25,10 @@ import java.lang.reflect.InvocationTarge
 import java.net.InetSocketAddress;
 import java.net.UnknownHostException;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.atomic.AtomicReference;
 
 import org.apache.commons.logging.Log;
@@ -51,7 +53,6 @@ import org.apache.hadoop.hbase.UnknownRe
 import org.apache.hadoop.hbase.catalog.CatalogTracker;
 import org.apache.hadoop.hbase.catalog.MetaEditor;
 import org.apache.hadoop.hbase.catalog.MetaReader;
-import org.apache.hadoop.hbase.client.HConnectionManager;
 import org.apache.hadoop.hbase.client.MetaScanner;
 import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
 import org.apache.hadoop.hbase.client.Result;
@@ -374,10 +375,16 @@ implements HMasterInterface, HMasterRegi
 
     // Wait for region servers to report in.  Returns count of regions.
     int regionCount = this.serverManager.waitForRegionServers();
-
+    
+    Set<String> knownServers = new HashSet<String>();
+    knownServers.addAll(serverManager.getOnlineServers().keySet());
+    if (this.serverManager.areDeadServersInProgress()) {
+      // Dead servers are processing, their logs would be split by
+      // ServerShutdownHandler
+      knownServers.addAll(serverManager.getDeadServersBeingProcessed());
+    }
     // TODO: Should do this in background rather than block master startup
-    this.fileSystemManager.
-      splitLogAfterStartup(this.serverManager.getOnlineServers());
+    this.fileSystemManager.splitLogAfterStartup(knownServers);
 
     // Make sure root and meta assigned before proceeding.
     assignRootAndMeta();
@@ -393,7 +400,7 @@ implements HMasterInterface, HMasterRegi
       this.assignmentManager.assignAllUserRegions();
     } else {
       LOG.info("Master startup proceeding: master failover");
-      this.assignmentManager.processFailover();
+      this.assignmentManager.processFailover(knownServers);
     }
 
     // Start balancer and meta catalog janitor after meta and regions have

Modified: hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java?rev=1231479&r1=1231478&r2=1231479&view=diff
==============================================================================
--- hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java (original)
+++ hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java Sat Jan 14 10:55:03 2012
@@ -20,7 +20,7 @@
 package org.apache.hadoop.hbase.master;
 
 import java.io.IOException;
-import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.locks.Lock;
 import java.util.concurrent.locks.ReentrantLock;
 
@@ -155,7 +155,7 @@ public class MasterFileSystem {
    * @param onlineServers Map of online servers keyed by
    * {@link HServerInfo#getServerName()}
    */
-  void splitLogAfterStartup(final Map<String, HServerInfo> onlineServers) {
+  void splitLogAfterStartup(final Set<String> onlineServers) {
     Path logsDirPath = new Path(this.rootdir, HConstants.HREGION_LOGDIR_NAME);
     try {
       if (!this.fs.exists(logsDirPath)) {
@@ -176,7 +176,7 @@ public class MasterFileSystem {
     }
     for (FileStatus status : logFolders) {
       String serverName = status.getPath().getName();
-      if (onlineServers.get(serverName) == null) {
+      if (!onlineServers.contains(serverName)) {
         LOG.info("Log folder " + status.getPath() + " doesn't belong " +
           "to a known region server, splitting");
         splitLog(serverName);

Modified: hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1231479&r1=1231478&r2=1231479&view=diff
==============================================================================
--- hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Sat Jan 14 10:55:03 2012
@@ -49,7 +49,6 @@ import org.apache.hadoop.hbase.ipc.HRegi
 import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
 import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
 import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
-import org.apache.hadoop.hbase.regionserver.Leases.LeaseStillHeldException;
 
 /**
  * The ServerManager class manages info about region servers - HServerInfo,
@@ -420,11 +419,22 @@ public class ServerManager {
     }
   }
 
+  /**
+   * @return Set of known dead servers.
+   */
   public Set<String> getDeadServers() {
     return this.deadservers.clone();
   }
 
   /**
+   * @return Set of dead servers which are being processed by the
+   *         ServerShutdownHander.
+   */
+  public Set<String> getDeadServersBeingProcessed() {
+    return this.deadservers.getDeadServersBeingProcessed();
+  }
+
+  /**
    * Checks if any dead servers are currently in progress.
    * @return true if any RS are being processed as dead, false if not
    */