You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2010/12/05 05:49:43 UTC

svn commit: r1042282 - in /hbase/trunk/src: main/java/org/apache/hadoop/hbase/master/ main/java/org/apache/hadoop/hbase/zookeeper/ test/java/org/apache/hadoop/hbase/master/

Author: stack
Date: Sun Dec  5 04:49:43 2010
New Revision: 1042282

URL: http://svn.apache.org/viewvc?rev=1042282&view=rev
Log:
HBASE-3311 Addendum patch on HBASE-3298 broke build

Modified:
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
    hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java?rev=1042282&r1=1042281&r2=1042282&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java Sun Dec  5 04:49:43 2010
@@ -126,13 +126,14 @@ class ActiveMasterManager extends ZooKee
           this.watcher.masterAddressZNode, this.address)) {
         // We are the master, return
         this.clusterHasActiveMaster.set(true);
+        LOG.info("Master=" + this.address);
         return cleanSetOfActiveMaster;
       }
+      cleanSetOfActiveMaster = false;
 
       // There is another active master running elsewhere or this is a restart
       // and the master ephemeral node has not expired yet.
       this.clusterHasActiveMaster.set(true);
-      cleanSetOfActiveMaster = false;
       HServerAddress currentMaster =
         ZKUtil.getDataAsAddress(this.watcher, this.watcher.masterAddressZNode);
       if (currentMaster != null && currentMaster.equals(this.address)) {

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1042282&r1=1042281&r2=1042282&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Sun Dec  5 04:49:43 2010
@@ -25,7 +25,6 @@ import java.io.IOException;
 import java.net.ConnectException;
 import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -55,6 +54,7 @@ import org.apache.hadoop.hbase.client.Re
 import org.apache.hadoop.hbase.executor.EventHandler.EventType;
 import org.apache.hadoop.hbase.executor.ExecutorService;
 import org.apache.hadoop.hbase.executor.RegionTransitionData;
+import org.apache.hadoop.hbase.master.AssignmentManager.RegionState;
 import org.apache.hadoop.hbase.master.LoadBalancer.RegionPlan;
 import org.apache.hadoop.hbase.master.handler.ClosedRegionHandler;
 import org.apache.hadoop.hbase.master.handler.OpenedRegionHandler;
@@ -333,6 +333,10 @@ public class AssignmentManager extends Z
    */
   private void handleRegion(final RegionTransitionData data) {
     synchronized(regionsInTransition) {
+      if (data == null || data.getServerName() == null) {
+        LOG.warn("Unexpected NULL input " + data);
+        return;
+      }
       // Verify this is a known server
       if (!serverManager.isServerOnline(data.getServerName()) &&
           !this.master.getServerName().equals(data.getServerName())) {
@@ -1039,47 +1043,39 @@ public class AssignmentManager extends Z
           region.getRegionNameAsString());
         return;
       }
-      LOG.debug("Server " + server + " region CLOSE RPC returned false");
+      // This never happens. Currently regionserver close always return true.
+      LOG.debug("Server " + server + " region CLOSE RPC returned false for " +
+        region.getEncodedName());
     } catch (NotServingRegionException nsre) {
-      // Failed to close, so pass through and reassign
       LOG.info("Server " + server + " returned " + nsre + " for " +
         region.getEncodedName());
+      // Presume that master has stale data.  Presume remote side just split.
+      // Presume that the split message when it comes in will fix up the master's
+      // in memory cluster state.
+      return;
     } catch (ConnectException e) {
-      // Failed to connect, so pass through and reassign
+      LOG.info("Failed connect to " + server + ", message=" + e.getMessage() +
+        ", region=" + region.getEncodedName());
+      // Presume that regionserver just failed and we haven't got expired
+      // server from zk yet.  Let expired server deal with clean up.
+    } catch (java.net.SocketTimeoutException e) {
       LOG.info("Server " + server + " returned " + e.getMessage() + " for " +
         region.getEncodedName());
-    } catch (java.net.SocketTimeoutException e) {
-      // Failed to connect, so pass through and reassign
-      LOG.info("Server " + server + " returned " + e.getMessage());
+      // Presume retry or server will expire.
     } catch (RemoteException re) {
-      if (re.unwrapRemoteException() instanceof NotServingRegionException) {
+      IOException ioe = re.unwrapRemoteException();
+      if (ioe instanceof NotServingRegionException) {
         // Failed to close, so pass through and reassign
-        LOG.debug("Server " + server + " returned NotServingRegionException");
+        LOG.debug("Server " + server + " returned " + ioe + " for " +
+          region.getEncodedName());
       } else {
-        this.master.abort("Remote unexpected exception",
-          re.unwrapRemoteException());
+        this.master.abort("Remote unexpected exception", ioe);
       }
     } catch (Throwable t) {
       // For now call abort if unexpected exception -- radical, but will get
       // fellas attention. St.Ack 20101012
       this.master.abort("Remote unexpected exception", t);
     }
-    /* This looks way wrong at least for the case where close failed because
-     * it was being concurrently split.  It also looks wrong for case where
-     * we cannot connect to remote server.  In that case, let the server
-     * expiration do the fixup.  I'm leaving this code here commented out for
-     * the moment in case I've missed something and this code is actually needed.
-     * St.Ack 12/04/2010.
-     * 
-    // Did not CLOSE, so set region offline and assign it
-    LOG.debug("Attempted to send CLOSE to " + server +
-      " for region " + region.getRegionNameAsString() + " but failed, " +
-      "setting region as OFFLINE and reassigning");
-    synchronized (regionsInTransition) {
-      forceRegionStateToOffline(region);
-    }
-    assign(region, true);
-    */
   }
 
   /**
@@ -1555,8 +1551,8 @@ public class AssignmentManager extends Z
                   // Attempt to transition node into OFFLINE
                   try {
                     data = new RegionTransitionData(
-                        EventType.M_ZK_REGION_OFFLINE,
-                        regionInfo.getRegionName());
+                      EventType.M_ZK_REGION_OFFLINE, regionInfo.getRegionName(),
+                      master.getServerName());
                     if (ZKUtil.setData(watcher, node, data.getBytes(),
                         stat.getVersion())) {
                       // Node is now OFFLINE, let's trigger another assignment

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1042282&r1=1042281&r2=1042282&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Sun Dec  5 04:49:43 2010
@@ -156,7 +156,7 @@ implements HMasterInterface, HMasterRegi
   // flag set after we become the active master (used for testing)
   private volatile boolean isActiveMaster = false;
   // flag set after we complete initialization once active (used for testing)
-  private volatile boolean isInitialized = false;
+  private volatile boolean initialized = false;
 
   // Instance of the hbase executor service.
   ExecutorService executorService;
@@ -400,7 +400,7 @@ implements HMasterInterface, HMasterRegi
       Threads.setDaemonThreadRunning(new CatalogJanitor(this, this));
 
     LOG.info("Master has completed initialization");
-    isInitialized = true;
+    initialized = true;
   }
 
   /**
@@ -998,7 +998,7 @@ implements HMasterInterface, HMasterRegi
    * @return true if master is ready to go, false if not.
    */
   public boolean isInitialized() {
-    return isInitialized;
+    return initialized;
   }
 
   @Override

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1042282&r1=1042281&r2=1042282&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Sun Dec  5 04:49:43 2010
@@ -578,17 +578,13 @@ public class ServerManager {
    */
   public boolean sendRegionClose(HServerInfo server, HRegionInfo region)
   throws IOException {
-    if (server == null) {
-      LOG.debug("Unable to send region close because server is null; region=" +
-          region.getRegionNameAsString());
-      return false;
-    }
+    if (server == null) throw new NullPointerException("Passed server is null");
     HRegionInterface hri = getServerConnection(server);
-    if(hri == null) {
-      LOG.warn("Attempting to send CLOSE RPC to server " +
-        server.getServerName() + " for region " + region.getRegionNameAsString()
-        + " failed because no RPC connection found to this server");
-      return false;
+    if (hri == null) {
+      throw new IOException("Attempting to send CLOSE RPC to server " +
+        server.getServerName() + " for region " +
+        region.getRegionNameAsString() +
+        " failed because no RPC connection found to this server");
     }
     return hri.closeRegion(region);
   }

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java?rev=1042282&r1=1042281&r2=1042282&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java Sun Dec  5 04:49:43 2010
@@ -294,8 +294,6 @@ public class ZooKeeperWatcher implements
   private void connectionEvent(WatchedEvent event) {
     switch(event.getState()) {
       case SyncConnected:
-        // Update our identifier.  Otherwise ignore.
-        LOG.debug(this.identifier + " connected");
         // Now, this callback can be invoked before the this.zookeeper is set.
         // Wait a little while.
         long finished = System.currentTimeMillis() +
@@ -312,6 +310,8 @@ public class ZooKeeperWatcher implements
         }
         this.identifier = this.identifier + "-0x" +
           Long.toHexString(this.zooKeeper.getSessionId());
+        // Update our identifier.  Otherwise ignore.
+        LOG.debug(this.identifier + " connected");
         break;
 
       // Abort the server if Disconnected or Expired

Modified: hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java?rev=1042282&r1=1042281&r2=1042282&view=diff
==============================================================================
--- hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java (original)
+++ hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java Sun Dec  5 04:49:43 2010
@@ -230,12 +230,11 @@ public class TestMasterFailover {
 
     // Create a ZKW to use in the test
     ZooKeeperWatcher zkw = new ZooKeeperWatcher(TEST_UTIL.getConfiguration(),
-        "unittest", new Abortable() {
-          @Override
-          public void abort(String why, Throwable e) {
-            LOG.error("Fatal ZK Error: " + why, e);
-            org.junit.Assert.assertFalse("Fatal ZK error", true);
-          }
+      "unittest", new Abortable() {
+        @Override
+        public void abort(String why, Throwable e) {
+          throw new RuntimeException("Fatal ZK error, why=" + why, e);
+        }
     });
 
     // get all the master threads
@@ -820,22 +819,40 @@ public class TestMasterFailover {
     master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
         new RegionState(region, RegionState.State.PENDING_OPEN, 0));
     ZKAssign.createNodeOffline(zkw, region, master.getServerName());
+    // This test is bad.  It puts up a PENDING_CLOSE but doesn't say what
+    // server we were PENDING_CLOSE against -- i.e. an entry in
+    // AssignmentManager#regions.  W/o a server, we NPE trying to resend close.
+    // In past, there was wonky logic that had us reassign region if no server
+    // at tail of the unassign.  This was removed.  Commenting out for now.
+    // TODO: Remove completely.
+    /*
     // PENDING_CLOSE and enabled
     region = enabledRegions.remove(0);
+    LOG.info("Setting PENDING_CLOSE enabled " + region.getEncodedName());
     regionsThatShouldBeOnline.add(region);
     master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
-        new RegionState(region, RegionState.State.PENDING_CLOSE, 0));
+      new RegionState(region, RegionState.State.PENDING_CLOSE, 0));
     // PENDING_CLOSE and disabled
     region = disabledRegions.remove(0);
+    LOG.info("Setting PENDING_CLOSE disabled " + region.getEncodedName());
     regionsThatShouldBeOffline.add(region);
     master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
-        new RegionState(region, RegionState.State.PENDING_CLOSE, 0));
+      new RegionState(region, RegionState.State.PENDING_CLOSE, 0));
+      */
 
     // Failover should be completed, now wait for no RIT
     log("Waiting for no more RIT");
     ZKAssign.blockUntilNoRIT(zkw);
     log("No more RIT in ZK");
-    master.assignmentManager.waitUntilNoRegionsInTransition(120000);
+    long now = System.currentTimeMillis();
+    final long maxTime = 120000;
+    boolean done = master.assignmentManager.waitUntilNoRegionsInTransition(maxTime);
+    if (!done) {
+      LOG.info("rit=" + master.assignmentManager.getRegionsInTransition());
+    }
+    long elapsed = System.currentTimeMillis() - now;
+    assertTrue("Elapsed=" + elapsed + ", maxTime=" + maxTime + ", done=" + done,
+      elapsed < maxTime);
     log("No more RIT in RIT map, doing final test verification");
 
     // Grab all the regions that are online across RSs
@@ -847,7 +864,7 @@ public class TestMasterFailover {
 
     // Now, everything that should be online should be online
     for (HRegionInfo hri : regionsThatShouldBeOnline) {
-      assertTrue(onlineRegions.contains(hri));
+      assertTrue("region=" + hri.getRegionNameAsString(), onlineRegions.contains(hri));
     }
 
     // Everything that should be offline should not be online