You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2009/01/07 22:03:20 UTC

svn commit: r732491 - in /hadoop/hbase/trunk: ./ src/java/org/apache/hadoop/hbase/ src/java/org/apache/hadoop/hbase/master/

Author: stack
Date: Wed Jan  7 13:03:20 2009
New Revision: 732491

URL: http://svn.apache.org/viewvc?rev=732491&view=rev
Log:
HBASE-1099 Regions assigned while master is splitting logs of recently crashed server; regionserver tries to execute incomplete log

Modified:
    hadoop/hbase/trunk/CHANGES.txt
    hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/Leases.java
    hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/BaseScanner.java
    hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ChangeTableState.java
    hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/HMaster.java
    hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java
    hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/RegionManager.java
    hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ServerManager.java

Modified: hadoop/hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/CHANGES.txt?rev=732491&r1=732490&r2=732491&view=diff
==============================================================================
--- hadoop/hbase/trunk/CHANGES.txt (original)
+++ hadoop/hbase/trunk/CHANGES.txt Wed Jan  7 13:03:20 2009
@@ -131,6 +131,8 @@
    HBASE-1083  Will keep scheduling major compactions if last time one ran, we
                didn't.
    HBASE-1101  NPE in HConnectionManager$TableServers.processBatchOfRows
+   HBASE-1099  Regions assigned while master is splitting logs of recently
+               crashed server; regionserver tries to execute incomplete log
 
   IMPROVEMENTS
    HBASE-901   Add a limit to key length, check key and value length on client side

Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/Leases.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/Leases.java?rev=732491&r1=732490&r2=732491&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/Leases.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/Leases.java Wed Jan  7 13:03:20 2009
@@ -55,7 +55,6 @@
   private final int leasePeriod;
   private final int leaseCheckFrequency;
   private volatile DelayQueue<Lease> leaseQueue = new DelayQueue<Lease>();
-
   protected final Map<String, Lease> leases = new HashMap<String, Lease>();
   private volatile boolean stopRequested = false;
 
@@ -88,15 +87,16 @@
       if (lease == null) {
         continue;
       }
-      // A lease expired
+      // A lease expired.  Run the expired code before removing from queue
+      // since its presence in queue is used to see if lease exists still.
+      if (lease.getListener() == null) {
+        LOG.error("lease listener is null for lease " + lease.getLeaseName());
+      } else {
+        lease.getListener().leaseExpired();
+      }
       synchronized (leaseQueue) {
         leases.remove(lease.getLeaseName());
-        if (lease.getListener() == null) {
-          LOG.error("lease listener is null for lease " + lease.getLeaseName());
-          continue;
-        }
       }
-      lease.getListener().leaseExpired();
     }
     close();
   }

Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/BaseScanner.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/BaseScanner.java?rev=732491&r1=732490&r2=732491&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/BaseScanner.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/BaseScanner.java Wed Jan  7 13:03:20 2009
@@ -338,17 +338,16 @@
   throws IOException {
     
     synchronized (regionManager) {
-      // Skip region - if ...
-      if(info.isOffline()                                 // offline
-          || regionManager.isOfflined(info.getRegionName())) { // queued for offline
-
+      // Skip region - if
+      if(info.isOffline() ||
+          regionManager.isOfflined(info.getRegionName())) { // queued for offline
         regionManager.removeRegion(info);
         return;
       }
       HServerInfo storedInfo = null;
+      boolean deadServerAndLogsSplit = false;
       boolean deadServer = false;
       if (serverName.length() != 0) {
-
         if (regionManager.isOfflined(info.getRegionName())) {
           // Skip if region is on kill list
           if(LOG.isDebugEnabled()) {
@@ -357,31 +356,31 @@
           }
           return;
         }
-
-        storedInfo = master.serverManager.getServerInfo(serverName);
-        deadServer = master.serverManager.isDead(serverName);
+        storedInfo = this.master.serverManager.getServerInfo(serverName);
+        deadServer = this.master.serverManager.isDead(serverName);
+        deadServerAndLogsSplit =
+          this.master.serverManager.isDeadServerLogsSplit(serverName);
       }
 
       /*
-       * If the server is a dead server or its startcode is off -- either null
+       * If the server is a dead server and its logs have been split or its
+       * not on the dead server lists and its startcode is off -- either null
        * or doesn't match the start code for the address -- then add it to the
        * list of unassigned regions IF not already there (or pending open).
        */ 
-      if ((deadServer || 
-          (storedInfo == null || storedInfo.getStartCode() != startCode)) &&
-          (!regionManager.isUnassigned(info) &&
-              !regionManager.isPending(info.getRegionName()) &&
-              !regionManager.isAssigned(info.getRegionName()))) {
-
+      if ((deadServerAndLogsSplit ||
+          (!deadServer && (storedInfo == null ||
+            (storedInfo.getStartCode() != startCode)))) &&
+          this.regionManager.assignable(info)) {
         // The current assignment is invalid
-
         if (LOG.isDebugEnabled()) {
-          LOG.debug("Current assignment of " +
-              info.getRegionNameAsString() +
-              " is not valid." +
-              (storedInfo == null ? " Server '" + serverName + "' unknown." :
+          LOG.debug("Current assignment of " + info.getRegionNameAsString() +
+            " is not valid; deadServerAndLogsSplit=" + deadServerAndLogsSplit +
+            ", deadServer=" + deadServer + ". " +
+            (storedInfo == null ? " Server '" + serverName + "' unknown." :
                 " serverInfo: " + storedInfo + ", passed startCode: " +
-                startCode + ", storedInfo.startCode: " + storedInfo.getStartCode()) +
+                startCode + ", storedInfo.startCode: " +
+                storedInfo.getStartCode()) +
           " Region is not unassigned, assigned or pending");
         }
 
@@ -389,7 +388,6 @@
         // This is only done from here if we are restarting and there is stale
         // data in the meta region. Once we are on-line, dead server log
         // recovery is handled by lease expiration and ProcessServerShutdown
-
         if (!regionManager.isInitialMetaScanComplete() &&
             serverName.length() != 0) {
           StringBuilder dirName = new StringBuilder("log_");
@@ -418,7 +416,7 @@
       }
     }
   }
-  
+
   /**
    * Notify the thread to die at the end of its next run
    */

Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ChangeTableState.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ChangeTableState.java?rev=732491&r1=732490&r2=732491&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ChangeTableState.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ChangeTableState.java Wed Jan  7 13:03:20 2009
@@ -91,9 +91,7 @@
       synchronized (master.regionManager) {
         if (online) {
           // Bring offline regions on-line
-          if (!master.regionManager.isUnassigned(i) &&
-              !master.regionManager.isAssigned(i.getRegionName()) &&
-              !master.regionManager.isPending(i.getRegionName())) {
+          if (!master.regionManager.assignable(i)) {
             master.regionManager.setUnassigned(i, false);
           }
         } else {

Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/HMaster.java?rev=732491&r1=732490&r2=732491&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/HMaster.java Wed Jan  7 13:03:20 2009
@@ -532,15 +532,14 @@
   /*
    * HMasterRegionInterface
    */
-  public MapWritable regionServerStartup(HServerInfo serverInfo) {
-    // Set the address for now even tho it will not be persisted on
-    // the HRS side.
+  public MapWritable regionServerStartup(final HServerInfo serverInfo) {
+    // Set the address for now even tho it will not be persisted on HRS side.
     String rsAddress = HBaseServer.getRemoteAddress();
-    serverInfo.setServerAddress(new HServerAddress
-        (rsAddress, serverInfo.getServerAddress().getPort()));
-    // register with server manager
-    serverManager.regionServerStartup(serverInfo);
-    // send back some config info
+    serverInfo.setServerAddress(new HServerAddress(rsAddress,
+      serverInfo.getServerAddress().getPort()));
+    // Register with server manager
+    this.serverManager.regionServerStartup(serverInfo);
+    // Send back some config info
     return createConfigurationSubset();
   }
   

Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java?rev=732491&r1=732490&r2=732491&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java Wed Jan  7 13:03:20 2009
@@ -45,11 +45,13 @@
  */
 class ProcessServerShutdown extends RegionServerOperation {
   private final HServerAddress deadServer;
-  private final String deadServerName;
+  /*
+   * Cache of the server name.
+   */
+  private final String deadServerStr;
   private final boolean rootRegionServer;
   private boolean rootRegionReassigned = false;
   private Path oldLogDir;
-  private boolean logSplit;
   private boolean rootRescanned;
   
 
@@ -74,9 +76,8 @@
       boolean rootRegionServer) {
     super(master);
     this.deadServer = serverInfo.getServerAddress();
-    this.deadServerName = this.deadServer.toString();
+    this.deadServerStr = this.deadServer.toString();
     this.rootRegionServer = rootRegionServer;
-    this.logSplit = false;
     this.rootRescanned = false;
     this.oldLogDir =
       new Path(master.rootdir, HLog.getHLogDirectoryName(serverInfo));
@@ -84,13 +85,14 @@
 
   @Override
   public String toString() {
-    return "ProcessServerShutdown of " + this.deadServer.toString();
+    return "ProcessServerShutdown of " + this.deadServerStr;
   }
 
-  /** Finds regions that the dead region server was serving */
+  /** Finds regions that the dead region server was serving
+   */
   protected void scanMetaRegion(HRegionInterface server, long scannerId,
-      byte [] regionName) throws IOException {
-
+    byte [] regionName)
+  throws IOException {
     List<ToDoEntry> toDoList = new ArrayList<ToDoEntry>();
     Set<HRegionInfo> regions = new HashSet<HRegionInfo>();
     List<byte []> emptyRows = new ArrayList<byte []>();
@@ -107,14 +109,13 @@
         if (values == null || values.size() == 0) {
           break;
         }
-        
         byte [] row = values.getRow();
-        // Check server name.  If null, be conservative and treat as though
-        // region had been on shutdown server (could be null because we
-        // missed edits in hlog because hdfs does not do write-append).
+        // Check server name.  If null, skip (We used to consider it was on
+        // shutdown server but that would mean that we'd reassign regions that
+        // were already out being assigned, ones that were product of a split
+        // that happened while the shutdown was being processed.
         String serverName = Writables.cellToString(values.get(COL_SERVER));
-        if (serverName != null && serverName.length() > 0 &&
-            deadServerName.compareTo(serverName) != 0) {
+        if (serverName == null || !deadServerStr.equals(serverName)) {
           // This isn't the server you're looking for - move along
           continue;
         }
@@ -159,7 +160,7 @@
         }
       }
     } finally {
-      if(scannerId != -1L) {
+      if (scannerId != -1L) {
         try {
           server.close(scannerId);
         } catch (IOException e) {
@@ -222,21 +223,22 @@
       long scannerId =
         server.openScanner(m.getRegionName(), COLUMN_FAMILY_ARRAY,
         EMPTY_START_ROW, HConstants.LATEST_TIMESTAMP, null);
-
-        scanMetaRegion(server, scannerId, m.getRegionName());
+      scanMetaRegion(server, scannerId, m.getRegionName());
       return true;
     }
   }
 
   @Override
   protected boolean process() throws IOException {
-    LOG.info("process shutdown of server " + deadServer + ": logSplit: " +
-      this.logSplit + ", rootRescanned: " + rootRescanned +
+    boolean logSplit =
+      this.master.serverManager.isDeadServerLogsSplit(this.deadServerStr);
+    LOG.info("process shutdown of server " + this.deadServerStr +
+      ": logSplit: " +
+      logSplit + ", rootRescanned: " + rootRescanned +
       ", numberOfMetaRegions: " + 
       master.regionManager.numMetaRegions() +
       ", onlineMetaRegions.size(): " + 
       master.regionManager.numOnlineMetaRegions());
-
     if (!logSplit) {
       // Process the old log file
       if (master.fs.exists(oldLogDir)) {
@@ -250,9 +252,9 @@
           master.regionManager.splitLogLock.unlock();
         }
       }
-      logSplit = true;
+      this.master.serverManager.setDeadServerLogsSplit(this.deadServerStr);
     }
-    
+
     if (this.rootRegionServer && !this.rootRegionReassigned) {
       // avoid multiple root region reassignment 
       this.rootRegionReassigned = true;
@@ -277,7 +279,6 @@
           new MetaRegion(master.getRootRegionLocation(),
               HRegionInfo.ROOT_REGIONINFO.getRegionName(),
               HConstants.EMPTY_START_ROW), this.master).doWithRetries();
-        
       if (result == null) {
         // Master is closing - give up
         return true;
@@ -290,7 +291,6 @@
       }
       rootRescanned = true;
     }
-    
     if (!metaTableAvailable()) {
       // We can't proceed because not all meta regions are online.
       // metaAvailable() has put this request on the delayedToDoQueue
@@ -309,7 +309,11 @@
           Bytes.toString(r.getRegionName()) + " on " + r.getServer());
       }
     }
-    master.serverManager.removeDeadServer(deadServerName);
+    // Remove this server from dead servers list.  Finished splitting logs.
+    this.master.serverManager.removeDeadServer(deadServerStr);
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Removed " + deadServerStr + " from deadservers Map");
+    }
     return true;
   }
-}
+}
\ No newline at end of file

Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/RegionManager.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/RegionManager.java?rev=732491&r1=732490&r2=732491&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/RegionManager.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/RegionManager.java Wed Jan  7 13:03:20 2009
@@ -694,6 +694,17 @@
     }
     return false;
   }
+
+  /**
+   * @param hri
+   * @return True if the passed region is assignable: i.e. not assigned, not
+   * pending and not unassigned.
+   */
+  public boolean assignable(final HRegionInfo hri) {
+    return !isUnassigned(hri) &&
+    !isPending(hri.getRegionName()) &&
+    !isAssigned(hri.getRegionName());
+  }
   
   /**
    * @param regionName

Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=732491&r1=732490&r2=732491&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ServerManager.java Wed Jan  7 13:03:20 2009
@@ -59,9 +59,13 @@
   final Map<String, HServerInfo> serversToServerInfo =
     new ConcurrentHashMap<String, HServerInfo>();
   
-  /** Set of known dead servers */
-  final Set<String> deadServers =
-    Collections.synchronizedSet(new HashSet<String>());
+  /**
+   * Set of known dead servers.  On lease expiration, servers are added here.
+   * Boolean holds whether its logs have been split or not.  Initially set to
+   * false.
+   */
+  private final Map<String, Boolean> deadServers =
+    new ConcurrentHashMap<String, Boolean>();
 
   /** SortedMap server load -> Set of server names */
   final SortedMap<HServerLoad, Set<String>> loadToServers =
@@ -89,24 +93,67 @@
     this.loggingPeriodForAverageLoad = master.getConfiguration().
       getLong("hbase.master.avgload.logging.period", 60000);
   }
-  
+ 
+  /*
+   * Look to see if we have ghost references to this regionserver such as
+   * still-existing leases or if regionserver is on the dead servers list
+   * getting its logs processed.
+   * @param serverInfo
+   * @return True if still ghost references and we have not been able to clear
+   * them or the server is shutting down.
+   */
+  private boolean checkForGhostReferences(final HServerInfo serverInfo) {
+    String s = serverInfo.getServerAddress().toString().trim();
+    boolean result = false;
+    boolean lease = false;
+    for (long sleepTime = -1; !master.closed.get() && !result;) {
+      if (sleepTime != -1) {
+        try {
+          Thread.sleep(sleepTime);
+        } catch (InterruptedException e) {
+          // Continue
+        }
+      }
+      if (!lease) {
+        try {
+          this.serverLeases.createLease(s, new ServerExpirer(s));
+        } catch (Leases.LeaseStillHeldException e) {
+          LOG.debug("Waiting on current lease to expire for " + e.getName());
+          sleepTime = this.master.leaseTimeout / 4;
+          continue;
+        }
+        lease = true;
+      }
+      // May be on list of dead servers.  If so, wait till we've cleared it.
+      String addr = serverInfo.getServerAddress().toString();
+      if (isDead(addr) && !isDeadServerLogsSplit(addr)) {
+        LOG.debug("Waiting on " + addr + " removal from dead list before " +
+          "processing report-for-duty request");
+        sleepTime = this.master.threadWakeFrequency;
+        try {
+          // Keep up lease.  May be here > lease expiration.
+          this.serverLeases.renewLease(s);
+        } catch (LeaseException e) {
+          LOG.warn("Failed renewal. Retrying.", e);
+        }
+        continue;
+      }
+      result = true;
+    }
+    return result;
+  }
+
   /**
    * Let the server manager know a new regionserver has come online
    * @param serverInfo
    */
-  public void regionServerStartup(HServerInfo serverInfo) {
+  public void regionServerStartup(final HServerInfo serverInfo) {
     String s = serverInfo.getServerAddress().toString().trim();
     LOG.info("Received start message from: " + s);
-    // Do the lease check up here. There might already be one out on this
-    // server expecially if it just shutdown and came back up near-immediately.
-    if (!master.closed.get()) {
-      try {
-        serverLeases.createLease(s, new ServerExpirer(s));
-      } catch (Leases.LeaseStillHeldException e) {
-        LOG.debug("Lease still held on " + e.getName());
-        return;
-      }
+    if (!checkForGhostReferences(serverInfo)) {
+      return;
     }
+    // Go on to process the regionserver registration.
     HServerLoad load = serversToLoad.remove(s);
     if (load != null) {
       // The startup message was from a known server.
@@ -119,7 +166,6 @@
         }
       }
     }
-
     HServerInfo storedInfo = serversToServerInfo.remove(s);
     if (storedInfo != null && !master.closed.get()) {
       // The startup message was from a known server with the same name.
@@ -137,7 +183,6 @@
         LOG.error("Insertion into toDoQueue was interrupted", e);
       }
     }
-
     // record new server
     load = new HServerLoad();
     serverInfo.setLoad(load);
@@ -703,7 +748,7 @@
             }
           }
         }
-        deadServers.add(server);
+        deadServers.put(server, Boolean.FALSE);
         try {
           master.toDoQueue.put(
               new ProcessServerShutdown(master, info, rootServer));
@@ -742,6 +787,23 @@
    * @return true if server is dead
    */
   public boolean isDead(String serverName) {
-    return deadServers.contains(serverName);
+    return deadServers.containsKey(serverName);
+  }
+
+  /**
+   * @param serverName
+   * @return True if this is a dead server and it has had its logs split.
+   */
+  public boolean isDeadServerLogsSplit(final String serverName) {
+    Boolean b = this.deadServers.get(serverName);
+    return b == null? false: b.booleanValue();
+  }
+
+  /**
+   * Set that this deadserver has had its log split.
+   * @param serverName
+   */
+  public void setDeadServerLogsSplit(final String serverName) {
+    this.deadServers.put(serverName, Boolean.TRUE);
   }
-}
\ No newline at end of file
+}