You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2010/05/17 18:48:57 UTC

svn commit: r945237 - in /hadoop/hbase/branches/0.20: ./ src/java/org/apache/hadoop/hbase/master/

Author: stack
Date: Mon May 17 16:48:57 2010
New Revision: 945237

URL: http://svn.apache.org/viewvc?rev=945237&view=rev
Log:
HBASE-2482 regions in transition do not get reassigned by master when RS crashes

Modified:
    hadoop/hbase/branches/0.20/CHANGES.txt
    hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/HMaster.java
    hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java
    hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/RegionManager.java
    hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/ServerManager.java

Modified: hadoop/hbase/branches/0.20/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/CHANGES.txt?rev=945237&r1=945236&r2=945237&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/CHANGES.txt (original)
+++ hadoop/hbase/branches/0.20/CHANGES.txt Mon May 17 16:48:57 2010
@@ -5,6 +5,8 @@ Release 0.20.5
                (Todd Lipcon via Stack)
    HBASE-2521  no license headers in 5 files
    HBASE-2503  PriorityQueue isn't thread safe, KeyValueHeap uses it that way
+   HBASE-2482  Regions in transition do not get reassigned by master when RS
+               crashes (Todd Lipcon via Stack)
 
 Release 0.20.4 - Mon May  3 16:16:02 PDT 2010
   INCOMPATIBLE CHANGES

Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/HMaster.java?rev=945237&r1=945236&r2=945237&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/HMaster.java Mon May 17 16:48:57 2010
@@ -35,6 +35,8 @@ import java.util.concurrent.DelayQueue;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.PriorityBlockingQueue;
 import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -158,6 +160,7 @@ public class HMaster extends Thread impl
   RegionManager regionManager;
   
   private MasterMetrics metrics;
+  final Lock splitLogLock = new ReentrantLock();
 
   /** 
    * Build the HMaster out of a raw configuration item.
@@ -609,14 +612,14 @@ public class HMaster extends Thread impl
       if(this.serverManager.getServerInfo(serverName) == null) {
         LOG.info("Log folder doesn't belong " +
             "to a known region server, splitting");
-        this.regionManager.splitLogLock.lock();
+        this.splitLogLock.lock();
         Path logDir =
           new Path(this.rootdir, HLog.getHLogDirectoryName(serverName));
         try {
           HLog.splitLog(this.rootdir, logDir, this.fs,
               getConfiguration());
         } finally {
-          this.regionManager.splitLogLock.unlock();
+          this.splitLogLock.unlock();
         }
       } else {
         LOG.info("Log folder belongs to an existing region server");

Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java?rev=945237&r1=945236&r2=945237&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java (original)
+++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java Mon May 17 16:48:57 2010
@@ -23,6 +23,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 
 import org.apache.hadoop.fs.Path;
@@ -36,6 +37,7 @@ import org.apache.hadoop.hbase.ipc.HRegi
 import org.apache.hadoop.hbase.regionserver.HLog;
 import org.apache.hadoop.hbase.regionserver.HRegion;
 import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.master.RegionManager.RegionState;
 
 /** 
  * Instantiated when a server's lease has expired, meaning it has crashed.
@@ -80,6 +82,7 @@ class ProcessServerShutdown extends Regi
 
     // check to see if I am responsible for either ROOT or any of the META tables.
 
+    // TODO Why do we do this now instead of at processing time?
     closeMetaRegions();
   }
 
@@ -109,6 +112,19 @@ class ProcessServerShutdown extends Regi
     }
   }
 
+  private void closeRegionsInTransition() {    
+    Map<String, RegionState> inTransition =
+      master.regionManager.getRegionsInTransitionOnServer(deadServer);
+    for (Map.Entry<String, RegionState> entry : inTransition.entrySet()) {
+      String regionName = entry.getKey();
+      RegionState state = entry.getValue();
+      
+      LOG.info("Region " + regionName + " was in transition " +
+          state + " on dead server " + deadServer + " - marking unassigned");
+      master.regionManager.setUnassigned(state.getRegionInfo(), true);
+    }
+  }
+  
   @Override
   public String toString() {
     return "ProcessServerShutdown of " + this.deadServer;
@@ -275,14 +291,14 @@ class ProcessServerShutdown extends Regi
     if (!logSplit) {
       // Process the old log file
       if (master.fs.exists(oldLogDir)) {
-        if (!master.regionManager.splitLogLock.tryLock()) {
+        if (!master.splitLogLock.tryLock()) {
           return false;
         }
         try {
           HLog.splitLog(master.rootdir, oldLogDir, master.fs,
             master.getConfiguration());
         } finally {
-          master.regionManager.splitLogLock.unlock();
+          master.splitLogLock.unlock();
         }
       }
       logSplit = true;
@@ -347,6 +363,9 @@ class ProcessServerShutdown extends Regi
           Bytes.toString(r.getRegionName()) + " on " + r.getServer());
       }
     }
+    
+    closeRegionsInTransition();
+
     // Remove this server from dead servers list.  Finished splitting logs.
     this.master.serverManager.removeDeadServer(deadServer);
     if (LOG.isDebugEnabled()) {

Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/RegionManager.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/RegionManager.java?rev=945237&r1=945236&r2=945237&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/RegionManager.java (original)
+++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/RegionManager.java Mon May 17 16:48:57 2010
@@ -22,6 +22,7 @@ package org.apache.hadoop.hbase.master;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
@@ -33,8 +34,6 @@ import java.util.TreeMap;
 import java.util.concurrent.ConcurrentSkipListMap;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicReference;
-import java.util.concurrent.locks.Lock;
-import java.util.concurrent.locks.ReentrantLock;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -68,8 +67,6 @@ class RegionManager implements HConstant
   
   private volatile boolean safeMode = true;
   
-  final Lock splitLogLock = new ReentrantLock();
-  
   private final RootScanner rootScannerThread;
   final MetaScanner metaScannerThread;
   
@@ -168,8 +165,8 @@ class RegionManager implements HConstant
     unsetRootRegion();
     if (!master.shutdownRequested.get()) {
       synchronized (regionsInTransition) {
-        RegionState s = new RegionState(HRegionInfo.ROOT_REGIONINFO);
-        s.setUnassigned();
+        RegionState s = new RegionState(HRegionInfo.ROOT_REGIONINFO,
+            RegionState.State.UNASSIGNED);
         regionsInTransition.put(
             HRegionInfo.ROOT_REGIONINFO.getRegionNameAsString(), s);
         LOG.info("ROOT inserted into regionsInTransition");
@@ -587,6 +584,23 @@ class RegionManager implements HConstant
     }
     return false;
   }
+  
+  /**
+   * Return a map of the regions in transition on a server.
+   * Returned map entries are region name -> RegionState
+   */
+  Map<String, RegionState> getRegionsInTransitionOnServer(String serverName) {
+    Map<String, RegionState> ret = new HashMap<String, RegionState>();
+    synchronized (regionsInTransition) {
+      for (Map.Entry<String, RegionState> entry : regionsInTransition.entrySet()) {
+        RegionState rs = entry.getValue();
+        if (serverName.equals(rs.getServerName())) {
+          ret.put(entry.getKey(), rs);
+        }
+      }
+    }
+    return ret;
+  }
 
   /**
    * Stop the root and meta scanners so that the region servers serving meta
@@ -736,8 +750,7 @@ class RegionManager implements HConstant
     byte [] regionName = region.getRegionName();
     
     Put put = new Put(regionName);
-    byte [] infoBytes = Writables.getBytes(info);
-    String infoString = new String(infoBytes);
+    
     put.add(CATALOG_FAMILY, REGIONINFO_QUALIFIER, Writables.getBytes(info));
     server.put(metaRegionName, put);
     
@@ -844,6 +857,10 @@ class RegionManager implements HConstant
             && !s.isUnassigned()
             && s.getServerName() != null
             && s.getServerName().equals(server.toString())) {
+          // TODO this code appears to be entirely broken, since
+          // server.toString() has no start code, but s.getServerName()
+          // does!
+          LOG.fatal("I DONT BELIEVE YOU WILL EVER SEE THIS!");
           // Has an outstanding meta region to be assigned.
           return true;
         }
@@ -976,7 +993,7 @@ class RegionManager implements HConstant
     synchronized (this.regionsInTransition) {
       s = regionsInTransition.get(info.getRegionNameAsString());
       if (s == null) {
-        s = new RegionState(info);
+        s = new RegionState(info, RegionState.State.UNASSIGNED);
         regionsInTransition.put(info.getRegionNameAsString(), s);
       }
     }
@@ -1058,7 +1075,7 @@ class RegionManager implements HConstant
       RegionState s =
         this.regionsInTransition.get(regionInfo.getRegionNameAsString());
       if (s == null) {
-        s = new RegionState(regionInfo);
+        s = new RegionState(regionInfo, RegionState.State.CLOSING);
       }
       // If region was asked to open before getting here, we could be taking
       // the wrong server name
@@ -1530,22 +1547,30 @@ class RegionManager implements HConstant
    * note on regionsInTransition data member above for listing of state
    * transitions.
    */
-  private static class RegionState implements Comparable<RegionState> {
+  static class RegionState implements Comparable<RegionState> {
     private final HRegionInfo regionInfo;
-    private volatile boolean unassigned = false;
-    private volatile boolean pendingOpen = false;
-    private volatile boolean open = false;
-    private volatile boolean closing = false;
-    private volatile boolean pendingClose = false;
-    private volatile boolean closed = false;
-    private volatile boolean offlined = false;
+    
+    enum State {
+      UNASSIGNED, // awaiting a server to be assigned
+      PENDING_OPEN, // told a server to open, hasn't opened yet
+      OPEN, // has been opened on RS, but not yet marked in META/ROOT
+      CLOSING, // a msg has been enqueued to close ths region, but not delivered to RS yet
+      PENDING_CLOSE, // msg has been delivered to RS to close this region
+      CLOSED // region has been closed but not yet marked in meta
+      
+    }
+    
+    private State state;
+    
+    private boolean isOfflined;
     
     /* Set when region is assigned or closing */
-    private volatile String serverName = null;
+    private String serverName = null;
 
     /* Constructor */
-    RegionState(HRegionInfo info) {
+    RegionState(HRegionInfo info, State state) {
       this.regionInfo = info;
+      this.state = state;
     }
     
     synchronized HRegionInfo getRegionInfo() {
@@ -1567,14 +1592,16 @@ class RegionManager implements HConstant
      * @return true if the region is being opened
      */
     synchronized boolean isOpening() {
-      return this.unassigned || this.pendingOpen || this.open;
+      return state == State.UNASSIGNED || 
+        state == State.PENDING_OPEN ||
+        state == State.OPEN;
     }
 
     /*
      * @return true if region is unassigned
      */
     synchronized boolean isUnassigned() {
-      return unassigned;
+      return state == State.UNASSIGNED;
     }
 
     /*
@@ -1583,120 +1610,84 @@ class RegionManager implements HConstant
      * called unless it is safe to do so.
      */
     synchronized void setUnassigned() {
-      this.unassigned = true;
-      this.pendingOpen = false;
-      this.open = false;
-      this.closing = false;
-      this.pendingClose = false;
-      this.closed = false;
-      this.offlined = false;
+      state = State.UNASSIGNED;
       this.serverName = null;
     }
 
     synchronized boolean isPendingOpen() {
-      return pendingOpen;
+      return state == State.PENDING_OPEN;
     }
 
     /*
      * @param serverName Server region was assigned to.
      */
     synchronized void setPendingOpen(final String serverName) {
-      if (!this.unassigned) {
+      if (state != State.UNASSIGNED) {
         LOG.warn("Cannot assign a region that is not currently unassigned. " +
           "FIX!! State: " + toString());
       }
-      this.unassigned = false;
-      this.pendingOpen = true;
-      this.open = false;
-      this.closing = false;
-      this.pendingClose = false;
-      this.closed = false;
-      this.offlined = false;
+      state = State.PENDING_OPEN;
       this.serverName = serverName;
     }
 
     synchronized boolean isOpen() {
-      return open;
+      return state == State.OPEN;
     }
 
     synchronized void setOpen() {
-      if (!pendingOpen) {
+      if (state != State.PENDING_OPEN) {
         LOG.warn("Cannot set a region as open if it has not been pending. " +
           "FIX!! State: " + toString());
       }
-      this.unassigned = false;
-      this.pendingOpen = false;
-      this.open = true;
-      this.closing = false;
-      this.pendingClose = false;
-      this.closed = false;
-      this.offlined = false;
+      state = State.OPEN;
     }
 
     synchronized boolean isClosing() {
-      return closing;
+      return state == State.CLOSING;
     }
 
     synchronized void setClosing(String serverName, boolean setOffline) {
-      this.unassigned = false;
-      this.pendingOpen = false;
-      this.open = false;
-      this.closing = true;
-      this.pendingClose = false;
-      this.closed = false;
-      this.offlined = setOffline;
+      state = State.CLOSING;
       this.serverName = serverName;
+      this.isOfflined = setOffline;
     }
     
     synchronized boolean isPendingClose() {
-      return this.pendingClose;
+      return state == State.PENDING_CLOSE;
     }
 
     synchronized void setPendingClose() {
-      if (!closing) {
+      if (state != State.CLOSING) {
         LOG.warn("Cannot set a region as pending close if it has not been " +
           "closing.  FIX!! State: " + toString());
       }
-      this.unassigned = false;
-      this.pendingOpen = false;
-      this.open = false;
-      this.closing = false;
-      this.pendingClose = true;
-      this.closed = false;
+      state = State.PENDING_CLOSE;
     }
 
     synchronized boolean isClosed() {
-      return this.closed;
+      return state == State.CLOSED;
     }
     
     synchronized void setClosed() {
-      if (!pendingClose && !pendingOpen && !closing) {
+      if (state != State.PENDING_CLOSE &&
+          state != State.PENDING_OPEN &&
+          state != State.CLOSING) {
         throw new IllegalStateException(
             "Cannot set a region to be closed if it was not already marked as" +
-            " pending close, pending open or closing. State: " + toString());
+            " pending close, pending open or closing. State: " + this);
       }
-      this.unassigned = false;
-      this.pendingOpen = false;
-      this.open = false;
-      this.closing = false;
-      this.pendingClose = false;
-      this.closed = true;
+      state = State.CLOSED;
     }
     
     synchronized boolean isOfflined() {
-      return this.offlined;
+      return (state == State.CLOSING ||
+        state == State.PENDING_CLOSE) && isOfflined;
     }
 
     @Override
     public synchronized String toString() {
       return ("name=" + Bytes.toString(getRegionName()) +
-          ", unassigned=" + this.unassigned +
-          ", pendingOpen=" + this.pendingOpen +
-          ", open=" + this.open +
-          ", closing=" + this.closing +
-          ", pendingClose=" + this.pendingClose +
-          ", closed=" + this.closed +
-          ", offlined=" + this.offlined);
+          ", state=" + this.state);
     }
     
     @Override

Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=945237&r1=945236&r2=945237&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/ServerManager.java Mon May 17 16:48:57 2010
@@ -48,6 +48,7 @@ import org.apache.hadoop.hbase.HMsg.Type
 import org.apache.hadoop.hbase.client.Get;
 import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.ipc.HRegionInterface;
+import org.apache.hadoop.hbase.master.RegionManager.RegionState;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
@@ -324,8 +325,15 @@ class ServerManager implements HConstant
     }
   }
 
-  /** Region server is exiting */
+  /**
+   * Region server is exiting with a clean shutdown.
+   * 
+   * In this case, the server sends MSG_REPORT_EXITING in msgs[0] followed by
+   * a MSG_REPORT_CLOSE for each region it was serving. 
+   */
   private void processRegionServerExit(HServerInfo serverInfo, HMsg[] msgs) {
+    assert msgs[0].getType() == Type.MSG_REPORT_EXITING;
+    
     synchronized (serversToServerInfo) {
       try {
         // This method removes ROOT/META from the list and marks them to be reassigned
@@ -342,6 +350,7 @@ class ServerManager implements HConstant
             for (int i = 1; i < msgs.length; i++) {
               LOG.info("Processing " + msgs[i] + " from " +
                   serverInfo.getServerName());
+              assert msgs[i].getType() == Type.MSG_REGION_CLOSE;
               HRegionInfo info = msgs[i].getRegionInfo();
               // Meta/root region offlining is handed in removeServerInfo above.
               if (!info.isMetaRegion()) {
@@ -356,6 +365,18 @@ class ServerManager implements HConstant
               }
             }
           }
+          
+          // There should not be any regions in transition for this server - the
+          // server should finish transitions itself before closing
+          Map<String, RegionState> inTransition =
+            master.regionManager.getRegionsInTransitionOnServer(
+            serverInfo.getServerName());
+          for (Map.Entry<String, RegionState> entry : inTransition.entrySet()) {
+            LOG.warn("Region server " + serverInfo.getServerName() +
+                " shut down with region " + entry.getKey() + " in transition " +
+                "state " + entry.getValue());
+            master.regionManager.setUnassigned(entry.getValue().getRegionInfo(), true);
+          }
         }
         // We don't need to return anything to the server because it isn't
         // going to do any more work.