You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ns...@apache.org on 2011/10/25 00:30:49 UTC

svn commit: r1188421 - in /hbase/branches/0.89-fb/src: main/java/org/apache/hadoop/hbase/ main/java/org/apache/hadoop/hbase/master/ main/java/org/apache/hadoop/hbase/regionserver/ main/java/org/apache/hadoop/hbase/replication/master/ main/java/org/apac...

Author: nspiegelberg
Date: Mon Oct 24 22:30:48 2011
New Revision: 1188421

URL: http://svn.apache.org/viewvc?rev=1188421&view=rev
Log:
First version of TestMasterFailover for hbase-89

**** 89 MASTER ONLY ****

Summary: An initial unit test with multiple masters for hbase-89. This is only
one test from the open source trunk's TestMasterFailover. Multiple changes are
being made in master code to be able to "kill" a master within a unit test
without shutting down the cluster.

Test Plan: Unit tests (including the new one), dev cluster.
Reviewers: kannan, kranganathan, nspiegelberg
Reviewed By: kannan
CC: hbase-eng@lists, mbautin, kannan, pkhemani

Differential Revision: 341948
Revert Plan: OK
Task ID: 558162

Added:
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
Modified:
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/Chore.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/BaseScanner.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessRegionOpen.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ZKMasterAddressWatcher.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ZKUnassignedWatcher.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/Chore.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/Chore.java?rev=1188421&r1=1188420&r2=1188421&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/Chore.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/Chore.java Mon Oct 24 22:30:48 2011
@@ -38,9 +38,22 @@ import org.apache.hadoop.hbase.util.Slee
 public abstract class Chore extends Thread {
   private final Log LOG = LogFactory.getLog(this.getClass());
   private final Sleeper sleeper;
+
+  /**
+   * This variable might belong to someone else, e.g. HMaster. Setting this
+   * variable might trigger cluster shutdown. To shut down this thread only,
+   * use {@link #threadStop}.
+   */
   protected volatile AtomicBoolean stop;
 
   /**
+   * Unlike {@link #stop}, this indicates that the current thread should shut
+   * down. We use this flag when the master requests a shutdown of base
+   * scanners, but we don't want to shut down the whole cluster.
+   */
+  private volatile boolean threadStop = false;
+
+  /**
    * @param p Period at which we should run.  Will be adjusted appropriately
    * should we find work and it takes time to complete.
    * @param s When this flag is set to true, this thread will cleanup and exit
@@ -59,7 +72,7 @@ public abstract class Chore extends Thre
   public void run() {
     try {
       boolean initialChoreComplete = false;
-      while (!this.stop.get()) {
+      while (!this.stop.get() && !threadStop) {
         long startTime = System.currentTimeMillis();
         try {
           if (!initialChoreComplete) {
@@ -112,4 +125,9 @@ public abstract class Chore extends Thre
   protected void sleep() {
     this.sleeper.sleep();
   }
+
+  protected void shutdownThisThread() {
+    threadStop = true;
+  }
+
 }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java?rev=1188421&r1=1188420&r2=1188421&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java Mon Oct 24 22:30:48 2011
@@ -56,8 +56,8 @@ import org.apache.hadoop.hbase.util.JVMC
  */
 public class LocalHBaseCluster {
   static final Log LOG = LogFactory.getLog(LocalHBaseCluster.class);
-  private final List<JVMClusterUtil.MasterThread> masterThreads =
-    new CopyOnWriteArrayList<JVMClusterUtil.MasterThread>();
+  private final List<HMaster> masters =
+    new CopyOnWriteArrayList<HMaster>();
   private final List<JVMClusterUtil.RegionServerThread> regionThreads =
     new CopyOnWriteArrayList<JVMClusterUtil.RegionServerThread>();
   private final static int DEFAULT_NO = 1;
@@ -147,20 +147,18 @@ public class LocalHBaseCluster {
     return rst;
   }
 
-  public JVMClusterUtil.MasterThread addMaster() throws IOException {
-    return addMaster(new Configuration(conf), this.masterThreads.size());
+  public HMaster addMaster() throws IOException {
+    return addMaster(new Configuration(conf), this.masters.size());
   }
 
-  public JVMClusterUtil.MasterThread addMaster(Configuration c, final int index)
+  public HMaster addMaster(Configuration c, final int index)
   throws IOException {
     // Create each master with its own Configuration instance so each has
     // its HConnection instance rather than share (see HBASE_INSTANCES down in
     // the guts of HConnectionManager.
-    JVMClusterUtil.MasterThread mt =
-      JVMClusterUtil.createMasterThread(c,
-        this.masterClass, index);
-    this.masterThreads.add(mt);
-    return mt;
+    HMaster m = JVMClusterUtil.createMaster(c, this.masterClass, index);
+    this.masters.add(m);
+    return m;
   }
 
   /**
@@ -175,10 +173,41 @@ public class LocalHBaseCluster {
    * @return the HMaster thread
    */
   public HMaster getMaster() {
-    if (masterThreads.size() != 1) {
+    if (masters.size() != 1) {
       throw new AssertionError("one master expected");
     }
-    return this.masterThreads.get(0).getMaster();
+    return this.masters.get(0);
+  }
+
+  /**
+   * @return Read-only list of master threads.
+   */
+  public List<HMaster> getMasters() {
+    return Collections.unmodifiableList(this.masters);
+  }
+
+  /**
+   * Removes the specified master from the list of masters and waits for its
+   * thread to stop.
+   * @param serverNumber index of the master in the list of masters
+   * @return Name of master that just went down.
+   */
+  public String waitOnMasterStop(int serverNumber) {
+    HMaster master = masters.remove(serverNumber);
+    boolean interrupted = false;
+    while (master.isAlive()) {
+      try {
+        LOG.info("Waiting on " +
+          master.getServerName().toString());
+        master.join();
+      } catch (InterruptedException e) {
+        interrupted = true;
+      }
+    }
+    if (interrupted) {
+      Thread.currentThread().interrupt();
+    }
+    return master.getName();
   }
 
   /**
@@ -241,8 +270,8 @@ public class LocalHBaseCluster {
         }
       }
     }
-    if (this.masterThreads != null) {
-      for (Thread t : this.masterThreads) {
+    if (this.masters != null) {
+      for (Thread t : this.masters) {
         if (t.isAlive()) {
           try {
             t.join();
@@ -258,14 +287,14 @@ public class LocalHBaseCluster {
    * Start the cluster.
    */
   public void startup() throws IOException {
-    JVMClusterUtil.startup(this.masterThreads, this.regionThreads);
+    JVMClusterUtil.startup(this.masters, this.regionThreads);
   }
 
   /**
    * Shut down the mini HBase cluster
    */
   public void shutdown() {
-    JVMClusterUtil.shutdown(this.masterThreads, this.regionThreads);
+    JVMClusterUtil.shutdown(this.masters, this.regionThreads);
   }
 
   /**

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/BaseScanner.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/BaseScanner.java?rev=1188421&r1=1188420&r2=1188421&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/BaseScanner.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/BaseScanner.java Mon Oct 24 22:30:48 2011
@@ -611,6 +611,7 @@ abstract class BaseScanner extends Chore
   public void interruptAndStop() {
     synchronized(scannerLock){
       if (isAlive()) {
+        shutdownThisThread();
         super.interrupt();
         LOG.info("Interrupted");
       }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1188421&r1=1188420&r2=1188421&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Mon Oct 24 22:30:48 2011
@@ -133,13 +133,24 @@ public class HMaster extends Thread impl
   private static final Log LOG = LogFactory.getLog(HMaster.class.getName());
   private static final String LOCALITY_SNAPSHOT_FILE_NAME = "regionLocality-snapshot";
 
-  // We start out with closed flag on.  Its set to off after construction.
-  // Use AtomicBoolean rather than plain boolean because we want other threads
-  // able to set shutdown flag.  Using AtomicBoolean can pass a reference
-  // rather than have them have to know about the hosting Master class.
-  final AtomicBoolean closed = new AtomicBoolean(true);
-  // TODO: Is this separate flag necessary?
-  private final AtomicBoolean shutdownRequested = new AtomicBoolean(false);
+  /**
+   * We start out with closed flag on. It's set to off after construction. Use
+   * AtomicBoolean rather than plain boolean because we want other threads able
+   * to set this flag and trigger a cluster shutdown. Using AtomicBoolean can
+   * pass a reference rather than have them have to know about the hosting
+   * Master class. This also has disadvantages, because this instance passed
+   * to another object can be confused with another thread's own shutdown flag.
+   */
+  private final AtomicBoolean closed = new AtomicBoolean(true);
+
+  /**
+   * This flag indicates that cluster shutdown has been requested. This is
+   * different from {@link #closed} in that it is initially false, but is
+   * set to true at shutdown and remains true from then on. For killing one
+   * instance of the master, see {@link #killed}.
+   */
+  private final AtomicBoolean clusterShutdownRequested =
+      new AtomicBoolean(false);
 
   private final Configuration conf;
   private final Path rootdir;
@@ -148,7 +159,7 @@ public class HMaster extends Thread impl
   private final int numRetries;
 
   // Metrics is set when we call run.
-  private final MasterMetrics metrics;
+  private MasterMetrics metrics;
 
   final Lock splitLogLock = new ReentrantLock();
   final boolean distributedLogSplitting;
@@ -171,12 +182,12 @@ public class HMaster extends Thread impl
   private final HServerAddress address;
 
   private final ServerConnection connection;
-  private final ServerManager serverManager;
-  private final RegionManager regionManager;
+  private ServerManager serverManager;
+  private RegionManager regionManager;
 
   private long lastFragmentationQuery = -1L;
   private Map<String, Integer> fragmentation = null;
-  private final RegionServerOperationQueue regionServerOperationQueue;
+  private RegionServerOperationQueue regionServerOperationQueue;
 
   // True if this is the master that started the cluster.
   boolean isClusterStartup;
@@ -187,9 +198,14 @@ public class HMaster extends Thread impl
   private long applyPreferredAssignmentPeriod = 0l;
   private long holdRegionForBestLocalityPeriod = 0l;
 
-  // flag set after we become the active master (used for testing)
+  /** True if the master is being killed. No cluster shutdown is done. */
+  private volatile boolean killed = false;
+
+  /** Flag set after we become the active master (used for testing). */
   private volatile boolean isActiveMaster = false;
 
+  public static final String MASTER_ID_CONF_KEY = "hbase.test.master.id";
+
   /**
    * Constructor
    * @param conf configuration
@@ -202,7 +218,8 @@ public class HMaster extends Thread impl
     // number of RS ephemeral nodes. RS ephemeral nodes are created only after
     // the primary master has written the address to ZK. So this has to be done
     // before we race to write our address to zookeeper.
-    zooKeeperWrapper = ZooKeeperWrapper.createInstance(conf, HMaster.class.getName());
+    zooKeeperWrapper = ZooKeeperWrapper.createInstance(conf,
+        getZKWrapperName());
     isClusterStartup = (zooKeeperWrapper.scanRSDirectory().size() == 0);
 
     // Get my address and create an rpc server instance.  The rpc-server port
@@ -254,12 +271,12 @@ public class HMaster extends Thread impl
     // TODO: Bring up the UI to redirect to active Master.
     zooKeeperWrapper.registerListener(this);
     this.zkMasterAddressWatcher =
-      new ZKMasterAddressWatcher(this.zooKeeperWrapper, this.shutdownRequested);
+      new ZKMasterAddressWatcher(this.zooKeeperWrapper, this.clusterShutdownRequested);
     zooKeeperWrapper.registerListener(zkMasterAddressWatcher);
 
     // if we're a backup master, stall until a primary to writes his address
-    if (conf.getBoolean(HConstants.MASTER_TYPE_BACKUP, HConstants.DEFAULT_MASTER_TYPE_BACKUP)) {
-
+    if (conf.getBoolean(HConstants.MASTER_TYPE_BACKUP,
+        HConstants.DEFAULT_MASTER_TYPE_BACKUP)) {
       // ephemeral node expiry will be detected between about 40 to 60 seconds;
       // plus add a little extra since only ZK leader can expire nodes, and
       // leader maybe a little  bit delayed in getting info about the pings.
@@ -275,26 +292,46 @@ public class HMaster extends Thread impl
         throw new IOException("Interrupted waiting for master address");
       }
     }
+  }
+
+  private void stallIfBackupMaster() {
+    if (!this.zkMasterAddressWatcher.writeAddressToZooKeeper(this.address,
+        true)) {
+      LOG.info("Failed to write master address to ZooKeeper, not starting (" +
+          "closed=" + closed.get() + ")");
+      return;
+    }
 
-    this.zkMasterAddressWatcher.writeAddressToZooKeeper(this.address, true);
     isActiveMaster = true;
     this.regionServerOperationQueue =
       new RegionServerOperationQueue(this.conf, this.closed);
 
     serverManager = new ServerManager(this);
 
-
     // Start the unassigned watcher - which will create the unassigned region
     // in ZK. This is needed before RegionManager() constructor tries to assign
     // the root region.
-    ZKUnassignedWatcher.start(this.conf, this);
+    try {
+      ZKUnassignedWatcher.start(this.conf, this);
+    } catch (IOException e) {
+      LOG.error("Failed to start ZK unassigned region watcher", e);
+      throw new RuntimeException(e);
+    }
+
     // start the "close region" executor service
-    HBaseEventType.RS2ZK_REGION_CLOSED.startMasterExecutorService(address.toString());
+    HBaseEventType.RS2ZK_REGION_CLOSED.startMasterExecutorService(
+        address.toString());
     // start the "open region" executor service
-    HBaseEventType.RS2ZK_REGION_OPENED.startMasterExecutorService(address.toString());
+    HBaseEventType.RS2ZK_REGION_OPENED.startMasterExecutorService(
+        address.toString());
 
     // start the region manager
-    regionManager = new RegionManager(this);
+    try {
+      regionManager = new RegionManager(this);
+    } catch (IOException e) {
+      LOG.error("Failed to instantiate region manager");
+      throw new RuntimeException(e);
+    }
 
     setName(MASTER);
     this.metrics = new MasterMetrics(MASTER, this.serverManager);
@@ -499,8 +536,8 @@ public class HMaster extends Thread impl
     return this.fs;
   }
 
-  public AtomicBoolean getShutdownRequested() {
-    return this.shutdownRequested;
+  public AtomicBoolean getClusterShutdownRequested() {
+    return this.clusterShutdownRequested;
   }
 
   AtomicBoolean getClosed() {
@@ -562,6 +599,12 @@ public class HMaster extends Thread impl
   /** Main processing loop */
   @Override
   public void run() {
+    stallIfBackupMaster();
+
+    if (closed.get()) {
+      LOG.info("Master is closing, not starting the main loop");
+      return;
+    }
     MonitoredTask startupStatus =
       TaskMonitor.get().createStatus("Master startup");
     startupStatus.setDescription("Master startup");
@@ -581,7 +624,7 @@ public class HMaster extends Thread impl
       /* Main processing loop */
       FINISHED: while (!this.closed.get()) {
         // check if we should be shutting down
-        if (this.shutdownRequested.get()) {
+        if (clusterShutdownRequested.get()) {
           // The region servers won't all exit until we stop scanning the
           // meta regions
           this.regionManager.stopScanners();
@@ -613,14 +656,26 @@ public class HMaster extends Thread impl
     }
 
     startupStatus.cleanup();
-    if (!this.shutdownRequested.get()) {  // shutdown not by request
-      shutdown();  // indicated that master is shutting down
-      startShutdown();  // get started with shutdown: stop scanners etc.
+    if (!this.clusterShutdownRequested.get()) {  // shutdown not by request
+      if (!killed) {
+        // This would trigger a cluster shutdown, so we are not doing this when
+        // the master is being "killed" in a unit test.
+        shutdown();
+      }
+
+      // Get started with shutdown: stop scanners, etc. This does not lead to
+      // a cluster shutdown.
+      startShutdown();
+    } else if (killed) {
+      startShutdown();
+    }
+
+    if (!killed) {
+      // Wait for all the remaining region servers to report in. Only doing
+      // this when the cluster is shutting down.
+      this.serverManager.letRegionServersShutdown();
     }
 
-    // Wait for all the remaining region servers to report in.
-    this.serverManager.letRegionServersShutdown();
-
     /*
      * Clean up and close up shop
      */
@@ -633,7 +688,15 @@ public class HMaster extends Thread impl
       }
     }
     this.rpcServer.stop();
-    this.regionManager.stop();
+
+    if (killed) {
+      regionManager.joinThreads();
+    } else {
+      // Compared to the above, this will clear the RS directory. We are not
+      // doing that when the master is being "killed" in a unit test.
+      regionManager.stop();
+    }
+
     this.zooKeeperWrapper.close();
     HBaseExecutorService.shutdown();
     LOG.info("HMaster main thread exiting");
@@ -965,7 +1028,7 @@ public class HMaster extends Thread impl
         // splitLogManager must be started before starting rpcServer because
         // region-servers dying will trigger log splitting
         this.splitLogManager = new SplitLogManager(zooKeeperWrapper, conf,
-            this.shutdownRequested, address.toString());
+            this.clusterShutdownRequested, address.toString());
         this.splitLogManager.finishInitialization();
       }
       // Start the server so that region servers are running before we start
@@ -991,7 +1054,7 @@ public class HMaster extends Thread impl
   }
 
   /*
-   * Start shutting down the master
+   * Start shutting down the master. This does NOT trigger a cluster shutdown.
    */
   void startShutdown() {
     this.closed.set(true);
@@ -1059,7 +1122,7 @@ public class HMaster extends Thread impl
   @Override
   public void shutdown() {
     LOG.info("Cluster shutdown requested. Starting to quiesce servers");
-    this.shutdownRequested.set(true);
+    this.clusterShutdownRequested.set(true);
     this.zooKeeperWrapper.setClusterState(false);
     if (splitLogManager != null) {
       this.splitLogManager.stop();
@@ -1736,7 +1799,7 @@ public class HMaster extends Thread impl
             cluster.startup();
           } else {
             HMaster master = constructMaster(masterClass, conf);
-            if (master.shutdownRequested.get()) {
+            if (master.clusterShutdownRequested.get()) {
               LOG.info("Won't bring the Master up as a shutdown is requested");
               return;
             }
@@ -1819,7 +1882,25 @@ public class HMaster extends Thread impl
   }
 
   public void stopMaster() {
+    LOG.info("Master stop requested, isActiveMaster=" + isActiveMaster);
     closed.set(true);
+    // If we are a backup master, we need to interrupt the wait
+    if (!isActiveMaster) {
+      zkMasterAddressWatcher.cancelMasterZNodeWait();
+    }
+  }
+
+  public void killMaster() {
+    killed = true;
+    stopMaster();
+  }
+
+  boolean isKilled() {
+    return killed;
+  }
+
+  String getZKWrapperName() {
+    return HMaster.class.getName() + conf.get(MASTER_ID_CONF_KEY, "");
   }
 
   public SplitLogManager getSplitLogManager() {

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java?rev=1188421&r1=1188420&r2=1188421&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java Mon Oct 24 22:30:48 2011
@@ -51,7 +51,7 @@ class MetaScanner extends BaseScanner {
    * @param master
    */
   public MetaScanner(HMaster master) {
-    super(master, false, master.getShutdownRequested());
+    super(master, false, master.getClusterShutdownRequested());
   }
 
   // Don't retry if we get an error while scanning. Errors are most often

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessRegionOpen.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessRegionOpen.java?rev=1188421&r1=1188420&r2=1188421&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessRegionOpen.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessRegionOpen.java Mon Oct 24 22:30:48 2011
@@ -117,7 +117,7 @@ public class ProcessRegionOpen extends P
       }
       ZooKeeperWrapper zkWrapper =
           ZooKeeperWrapper.getInstance(master.getConfiguration(),
-              HMaster.class.getName());
+              master.getZKWrapperName());
       zkWrapper.deleteUnassignedRegion(regionInfo.getEncodedName());
       return true;
     }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java?rev=1188421&r1=1188420&r2=1188421&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java Mon Oct 24 22:30:48 2011
@@ -201,7 +201,7 @@ public class RegionManager {
 
     this.master = master;
     this.zkWrapper =
-        ZooKeeperWrapper.getInstance(conf, HMaster.class.getName());
+        ZooKeeperWrapper.getInstance(conf, master.getZKWrapperName());
     this.maxAssignInOneGo = conf.getInt("hbase.regions.percheckin", 10);
     this.loadBalancer = new LoadBalancer(conf);
 
@@ -242,7 +242,7 @@ public class RegionManager {
 
   void reassignRootRegion() {
     unsetRootRegion();
-    if (!master.getShutdownRequested().get()) {
+    if (!master.getClusterShutdownRequested().get()) {
       synchronized (regionsInTransition) {
         String regionName = HRegionInfo.ROOT_REGIONINFO.getRegionNameAsString();
         byte[] data = null;
@@ -781,22 +781,29 @@ public class RegionManager {
 
   /** Stop the region assigner */
   public void stop() {
+    joinThreads();
+    master.getZooKeeperWrapper().clearRSDirectory();
+    master.getZooKeeperWrapper().close();
+  }
+
+  /**
+   * Terminate all threads but don't clean up any state.
+   */
+  public void joinThreads() {
     try {
       if (rootScannerThread.isAlive()) {
         rootScannerThread.join();       // Wait for the root scanner to finish.
       }
-    } catch (Exception iex) {
+    } catch (InterruptedException iex) {
       LOG.warn("root scanner", iex);
     }
     try {
       if (metaScannerThread.isAlive()) {
         metaScannerThread.join();       // Wait for meta scanner to finish.
       }
-    } catch(Exception iex) {
+    } catch (InterruptedException iex) {
       LOG.warn("meta scanner", iex);
     }
-    master.getZooKeeperWrapper().clearRSDirectory();
-    master.getZooKeeperWrapper().close();
   }
 
   /**
@@ -1342,7 +1349,7 @@ public class RegionManager {
    */
   public void waitForRootRegionLocation() {
     synchronized (rootRegionLocation) {
-      while (!master.getShutdownRequested().get() &&
+      while (!master.getClusterShutdownRequested().get() &&
           !master.isClosed() && rootRegionLocation.get() == null) {
         // rootRegionLocation will be filled in when we get an 'open region'
         // regionServerReport message from the HRegionServer that has been

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java?rev=1188421&r1=1188420&r2=1188421&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java Mon Oct 24 22:30:48 2011
@@ -31,7 +31,7 @@ class RootScanner extends BaseScanner {
    * @param master
    */
   public RootScanner(HMaster master) {
-    super(master, true, master.getShutdownRequested());
+    super(master, true, master.getClusterShutdownRequested());
   }
 
   /**

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1188421&r1=1188420&r2=1188421&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Mon Oct 24 22:30:48 2011
@@ -147,13 +147,13 @@ public class ServerManager {
       60 * 1000);
     this.minimumServerCount = c.getInt("hbase.regions.server.count.min", 0);
     this.serverMonitorThread = new ServerMonitor(metaRescanInterval,
-      this.master.getShutdownRequested());
+      this.master.getClusterShutdownRequested());
     String n = Thread.currentThread().getName();
     Threads.setDaemonThreadRunning(this.serverMonitorThread,
       n + ".serverMonitor");
     this.oldLogCleaner = new OldLogsCleaner(
       c.getInt("hbase.master.meta.thread.rescanfrequency",60 * 1000),
-        this.master.getShutdownRequested(), c,
+        this.master.getClusterShutdownRequested(), c,
         master.getFileSystem(), master.getOldLogDir());
     Threads.setDaemonThreadRunning(oldLogCleaner,
       n + ".oldLogCleaner");
@@ -282,7 +282,7 @@ public class ServerManager {
         this.quiescedServers.incrementAndGet();
       }
     }
-    if (this.master.getShutdownRequested().get()) {
+    if (this.master.getClusterShutdownRequested().get()) {
       if (quiescedServers.get() >= serversToServerInfo.size()) {
         // If the only servers we know about are meta servers, then we can
         // proceed with shutdown
@@ -300,7 +300,7 @@ public class ServerManager {
         return new HMsg [] {HMsg.REGIONSERVER_QUIESCE};
       }
     }
-    if (this.master.isClosed()) {
+    if (this.master.isClosed() && !master.isKilled()) {
       // Tell server to shut down if we are shutting down.  This should
       // happen after check of MSG_REPORT_EXITING above, since region server
       // will send us one of these messages after it gets MSG_REGIONSERVER_STOP
@@ -395,7 +395,7 @@ public class ServerManager {
           ": MSG_REPORT_EXITING");
         // Get all the regions the server was serving reassigned
         // (if we are not shutting down).
-        if (!master.closed.get()) {
+        if (!master.isClosed()) {
           for (int i = 1; i < msgs.length; i++) {
             LOG.info("Processing " + msgs[i] + " from " +
               serverInfo.getServerName());

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ZKMasterAddressWatcher.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ZKMasterAddressWatcher.java?rev=1188421&r1=1188420&r2=1188421&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ZKMasterAddressWatcher.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ZKMasterAddressWatcher.java Mon Oct 24 22:30:48 2011
@@ -89,14 +89,16 @@ class ZKMasterAddressWatcher implements 
    * Wait for master address to be available. This sets a watch in ZooKeeper and
    * blocks until the master address ZNode gets deleted.
    */
-  public synchronized void waitForMasterAddressAvailability() {
-    while (zookeeper.readMasterAddress(zookeeper) != null) {
+  private synchronized void waitForMasterAddressAvailability() {
+    while (!requestShutdown.get() &&
+           zookeeper.readMasterAddress(zookeeper) != null) {
       try {
         LOG.debug("Waiting for master address ZNode to be deleted " +
           "(Also watching cluster state node)");
         this.zookeeper.setClusterStateWatch();
         wait();
       } catch (InterruptedException e) {
+        Thread.currentThread().interrupt();
       }
     }
   }
@@ -112,7 +114,7 @@ class ZKMasterAddressWatcher implements 
       waitForMasterAddressAvailability();
       // Check if we need to shutdown instead of taking control
       if (this.requestShutdown.get()) {
-        LOG.debug("Won't start Master because cluster is shuting down");
+        LOG.debug("Won't start Master because of requested shutdown");
         return false;
       }
       if(this.zookeeper.writeMasterAddress(address)) {
@@ -133,4 +135,10 @@ class ZKMasterAddressWatcher implements 
   public void setZookeeper(ZooKeeperWrapper zookeeper) {
     this.zookeeper = zookeeper;
   }
+
+  synchronized void cancelMasterZNodeWait() {
+    requestShutdown.set(true);
+    notifyAll();
+  }
+
 }
\ No newline at end of file

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ZKUnassignedWatcher.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ZKUnassignedWatcher.java?rev=1188421&r1=1188420&r2=1188421&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ZKUnassignedWatcher.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ZKUnassignedWatcher.java Mon Oct 24 22:30:48 2011
@@ -56,7 +56,7 @@ public class ZKUnassignedWatcher impleme
   throws IOException {
     this.serverName = master.getHServerAddress().toString();
     this.serverManager = master.getServerManager();
-    zkWrapper = ZooKeeperWrapper.getInstance(conf, HMaster.class.getName());
+    zkWrapper = ZooKeeperWrapper.getInstance(conf, master.getZKWrapperName());
     String unassignedZNode = zkWrapper.getRegionInTransitionZNode();
 
     // If the UNASSIGNED ZNode exists and this is a fresh cluster start, then

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java?rev=1188421&r1=1188420&r2=1188421&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java Mon Oct 24 22:30:48 2011
@@ -199,7 +199,7 @@ public class SplitLogWorker implements R
    * try to grab every task that has been put up
    */
   private void taskLoop() {
-    while (true) {
+    while (!exitWorker) {
       int seq_start = taskReadySeq;
       List<String> paths = getTaskList();
       if (paths == null) {
@@ -213,7 +213,7 @@ public class SplitLogWorker implements R
         // don't call ZKSplitLog.getNodeName() because that will lead to
         // double encoding of the path name
         grabTask(watcher.getZNode(watcher.splitLogZNode, paths.get(idx)));
-        if (exitWorker == true) {
+        if (exitWorker) {
           return;
         }
       }
@@ -222,9 +222,10 @@ public class SplitLogWorker implements R
           try {
             taskReadyLock.wait();
           } catch (InterruptedException e) {
-            LOG.warn("SplitLogWorker inteurrpted while waiting for task," +
-                " exiting", e);
-            assert exitWorker == true;
+            LOG.info("SplitLogWorker interrupted while waiting for task," +
+                " exiting: " + e.toString() + (exitWorker ? "" :
+                " (ERROR: exitWorker is not set, exiting anyway)"));
+            exitWorker = true;
             return;
           }
         }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java?rev=1188421&r1=1188420&r2=1188421&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java Mon Oct 24 22:30:48 2011
@@ -49,11 +49,14 @@ public class ReplicationLogCleaner imple
   private Configuration conf;
   private ReplicationZookeeperWrapper zkHelper;
   private Set<String> hlogs = new HashSet<String>();
+  private final String zkWrapperName;
 
   /**
    * Instantiates the cleaner, does nothing more.
    */
-  public ReplicationLogCleaner() {}
+  public ReplicationLogCleaner(String zkWrapperName) {
+    this.zkWrapperName = zkWrapperName;
+  }
 
   @Override
   public boolean isLogDeletable(Path filePath) {
@@ -121,8 +124,7 @@ public class ReplicationLogCleaner imple
     this.ttlCleaner.setConf(conf);
     try {
       this.zkHelper = new ReplicationZookeeperWrapper(
-          ZooKeeperWrapper.createInstance(this.conf,
-              HMaster.class.getName()),
+          ZooKeeperWrapper.createInstance(this.conf, zkWrapperName),
           this.conf, new AtomicBoolean(true), null);
     } catch (IOException e) {
       LOG.error(e);

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java?rev=1188421&r1=1188420&r2=1188421&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java Mon Oct 24 22:30:48 2011
@@ -96,41 +96,7 @@ public class JVMClusterUtil {
   }
 
   /**
-   * Datastructure to hold Master Thread and Master instance
-   */
-  public static class MasterThread extends Thread {
-    private final HMaster master;
-
-    public MasterThread(final HMaster m, final int index) {
-      super(m, "Master:" + index);
-      this.master = m;
-    }
-
-    /** @return the master */
-    public HMaster getMaster() {
-      return this.master;
-    }
-
-    /**
-     * Block until the master has come online, indicating it is ready
-     * to be used.
-     */
-    public void waitForServerOnline() {
-      // The server is marked online after init begins but before race to become
-      // the active master.
-      while (!this.master.isMasterRunning() &&
-          !this.master.getShutdownRequested().get()) {
-        try {
-          Thread.sleep(1000);
-        } catch (InterruptedException e) {
-          // continue waiting
-        }
-      }
-    }
-  }
-
-  /**
-   * Creates a {@link MasterThread}.
+   * Creates a {@link HMaster}.
    * Call 'start' on the returned thread to make it run.
    * @param c Configuration to use.
    * @param hmc Class to create.
@@ -138,13 +104,16 @@ public class JVMClusterUtil {
    * @throws IOException
    * @return Master added.
    */
-  public static JVMClusterUtil.MasterThread createMasterThread(
+  public static HMaster createMaster(
       final Configuration c, final Class<? extends HMaster> hmc,
       final int index)
   throws IOException {
-    HMaster server;
+    Configuration masterConf = new Configuration(c);
+    masterConf.setInt(HMaster.MASTER_ID_CONF_KEY, index);
+
+    HMaster master;
     try {
-      server = hmc.getConstructor(Configuration.class).newInstance(c);
+      master = hmc.getConstructor(Configuration.class).newInstance(masterConf);
     } catch (InvocationTargetException ite) {
       Throwable target = ite.getTargetException();
       throw new RuntimeException("Failed construction of Master: " +
@@ -155,7 +124,7 @@ public class JVMClusterUtil {
       ioe.initCause(e);
       throw ioe;
     }
-    return new JVMClusterUtil.MasterThread(server, index);
+    return master;
   }
 
   /**
@@ -164,10 +133,10 @@ public class JVMClusterUtil {
    * @param regionServers
    * @return Address to use contacting master.
    */
-  public static String startup(final List<JVMClusterUtil.MasterThread> masters,
+  public static String startup(final List<HMaster> masters,
       final List<JVMClusterUtil.RegionServerThread> regionservers) throws IOException {
     if (masters != null) {
-      for (JVMClusterUtil.MasterThread t : masters) {
+      for (HMaster t : masters) {
         t.start();
       }
     }
@@ -181,9 +150,9 @@ public class JVMClusterUtil {
     }
     // Wait for an active master
     while (true) {
-      for (JVMClusterUtil.MasterThread t : masters) {
-        if (t.master.isActiveMaster()) {
-          return t.master.getServerName().toString();
+      for (HMaster t : masters) {
+        if (t.isActiveMaster()) {
+          return t.getServerName().toString();
         }
       }
       try {
@@ -198,15 +167,17 @@ public class JVMClusterUtil {
    * @param master
    * @param regionservers
    */
-  public static void shutdown(final List<MasterThread> masters,
+  public static void shutdown(final List<HMaster> masters,
       final List<RegionServerThread> regionservers) {
     LOG.debug("Shutting down HBase Cluster");
     if (masters != null) {
-      for (JVMClusterUtil.MasterThread t : masters) {
-        if (t.master.isActiveMaster()) {
-          t.master.shutdown();
+      for (HMaster t : masters) {
+        if (t.isActiveMaster()) {
+          // This will trigger cluster shutdown.
+          t.shutdown();
         } else {
-          t.master.stopMaster();
+          // This will only stop this particular master.
+          t.stopMaster();
         }
       }
     }
@@ -226,7 +197,7 @@ public class JVMClusterUtil {
       }
 
       if (masters != null) {
-        for (JVMClusterUtil.MasterThread t : masters) {
+        for (HMaster t : masters) {
           while (t.isAlive()) {
             try {
               t.join();

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java?rev=1188421&r1=1188420&r2=1188421&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java Mon Oct 24 22:30:48 2011
@@ -165,8 +165,9 @@ public class ZooKeeperWrapper implements
   // creates only one instance
   public static ZooKeeperWrapper createInstance(Configuration conf, String name)
   throws IOException {
-    if (getInstance(conf, name) != null) {
-      return getInstance(conf, name);
+    ZooKeeperWrapper zkw = getInstance(conf, name);
+    if (zkw != null) {
+      return zkw;
     }
     ZooKeeperWrapper.createLock.lock();
     try {

Modified: hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java?rev=1188421&r1=1188420&r2=1188421&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java (original)
+++ hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java Mon Oct 24 22:30:48 2011
@@ -226,12 +226,11 @@ public class HBaseTestingUtility {
    * Can only create one.
    * @param dir Where to home your dfs cluster.
    * @param servers How many DNs to start.
-   * @throws Exception
    * @see {@link #shutdownMiniDFSCluster()}
    * @return The mini dfs cluster created.
    */
   public MiniDFSCluster startMiniDFSCluster(int servers, final File dir)
-  throws Exception {
+      throws IOException {
     // This does the following to home the minidfscluster
     //     base_dir = new File(System.getProperty("test.build.data", "build/test/data"), "dfs/");
     // Some tests also do this:
@@ -270,7 +269,7 @@ public class HBaseTestingUtility {
   }
 
   private MiniZooKeeperCluster startMiniZKCluster(final File dir)
-  throws Exception {
+      throws IOException, InterruptedException {
     if (this.zkCluster != null) {
       throw new IOException("Cluster already running at " + dir);
     }
@@ -298,7 +297,25 @@ public class HBaseTestingUtility {
    * @see {@link #shutdownMiniDFSCluster()}
    */
   public MiniHBaseCluster startMiniCluster() throws Exception {
-    return startMiniCluster(1);
+    return startMiniCluster(1, 1);
+  }
+
+  /**
+   * Start up a minicluster of hbase, optionally dfs, and zookeeper.
+   * Modifies Configuration.  Homes the cluster data directory under a random
+   * subdirectory in a directory under System property test.build.data.
+   * Directory is cleaned up on exit.
+   * @param numSlaves Number of slaves to start up.  We'll start this many
+   * datanodes and regionservers.  If numSlaves is > 1, then make sure
+   * hbase.regionserver.info.port is -1 (i.e. no ui per regionserver) otherwise
+   * bind errors.
+   * @throws Exception
+   * @see {@link #shutdownMiniCluster()}
+   * @return Mini hbase cluster instance created.
+   */
+  public MiniHBaseCluster startMiniCluster(final int numSlaves)
+      throws IOException, InterruptedException {
+    return startMiniCluster(1, numSlaves);
   }
 
   /**
@@ -306,16 +323,19 @@ public class HBaseTestingUtility {
    * Modifies Configuration.  Homes the cluster data directory under a random
    * subdirectory in a directory under System property test.build.data.
    * Directory is cleaned up on exit.
-   * @param servers Number of servers to start up.  We'll start this many
-   * datanodes and regionservers.  If servers is > 1, then make sure
+   * @param numMasters Number of masters to start up.  We'll start this many
+   * hbase masters.  If numMasters > 1, you can find the active/primary master
+   * with {@link MiniHBaseCluster#getMaster()}.
+   * @param numSlaves Number of slave servers to start up.  We'll start this
+   * many datanodes and regionservers.  If numSlaves is > 1, then make sure
    * hbase.regionserver.info.port is -1 (i.e. no ui per regionserver) otherwise
    * bind errors.
    * @throws Exception
    * @see {@link #shutdownMiniCluster()}
    * @return Mini hbase cluster instance created.
    */
-  public MiniHBaseCluster startMiniCluster(final int servers)
-  throws Exception {
+  public MiniHBaseCluster startMiniCluster(final int numMasters,
+      final int numSlaves) throws IOException, InterruptedException {
     LOG.info("Starting up minicluster");
     // If we already put up a cluster, fail.
     isRunningCluster();
@@ -325,7 +345,7 @@ public class HBaseTestingUtility {
     System.setProperty(TEST_DIRECTORY_KEY, this.clusterTestBuildDir.getPath());
     // Bring up mini dfs cluster. This spews a bunch of warnings about missing
     // scheme. Complaints are 'Scheme is undefined for build/test/data/dfs/name1'.
-    startMiniDFSCluster(servers, this.clusterTestBuildDir);
+    startMiniDFSCluster(numSlaves, this.clusterTestBuildDir);
 
     // Mangle conf so fs parameter points to minidfs we just started up
     FileSystem fs = this.dfsCluster.getFileSystem();
@@ -344,7 +364,7 @@ public class HBaseTestingUtility {
     this.conf.set(HConstants.HBASE_DIR, hbaseRootdir.toString());
     fs.mkdirs(hbaseRootdir);
     FSUtils.setVersion(fs, hbaseRootdir);
-    this.hbaseCluster = new MiniHBaseCluster(this.conf, servers);
+    this.hbaseCluster = new MiniHBaseCluster(this.conf, numMasters, numSlaves);
     // Don't leave here till we've done a successful scan of the .META.
     HTable t = new HTable(this.conf, HConstants.META_TABLE_NAME);
     ResultScanner s = t.getScanner(new Scan());

Modified: hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java?rev=1188421&r1=1188420&r2=1188421&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java (original)
+++ hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java Mon Oct 24 22:30:48 2011
@@ -509,4 +509,56 @@ public class MiniHBaseCluster {
   throws IOException {
     ((MiniHBaseClusterMaster)getMaster()).addMessage(hrs.getHServerInfo(), msg);
   }
+
+  /**
+   * @return List of master threads.
+   */
+  public List<HMaster> getMasters() {
+    return this.hbaseCluster.getMasters();
+  }
+
+  /**
+  * Kill the specified master cleanly. Does not result in a cluster shutdown.
+  *
+  * @param serverNumber Used as index into a list.
+  * @return the master that was stopped
+  */
+  public HMaster killMaster(int serverNumber) {
+    HMaster server = hbaseCluster.getMasters().get(serverNumber);
+    LOG.info("Killing master " + server.toString());
+    server.killMaster();
+    return server;
+  }
+
+  /**
+   * Wait for the specified master to stop. Removes this thread from list
+   * of running threads.
+   * @param serverNumber
+   * @return Name of master that just went down.
+   */
+  public String waitOnMasterStop(final int serverNumber) {
+    return this.hbaseCluster.waitOnMasterStop(serverNumber);
+  }
+
+  /**
+   * Blocks until one of the remaining masters reports itself active via
+   * {@link HMaster#isActiveMaster()}; no separate readiness check is made.
+   *
+   * @return true if an active master becomes available.  false if there are no
+   *         masters left.
+   * @throws InterruptedException
+   */
+  public boolean waitForActiveAndReadyMaster() throws InterruptedException {
+    List<HMaster> masters;
+    while ((masters = getMasters()).size() > 0) {
+      for (HMaster master : masters) {
+        if (master.isActiveMaster()) {
+          return true;
+        }
+      }
+      Thread.sleep(200);
+    }
+    return false;
+  }
+
 }

Added: hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java?rev=1188421&view=auto
==============================================================================
--- hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java (added)
+++ hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java Mon Oct 24 22:30:48 2011
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.MiniHBaseCluster;
+import org.junit.Test;
+
+public class TestMasterFailover {
+  private static final Log LOG = LogFactory.getLog(TestMasterFailover.class);
+
+  /**
+   * Simple test of master failover.
+   * <p>
+   * Starts with three masters.  Kills a backup master.  Then kills the active
+   * master.  Ensures the final master becomes active and we can still contact
+   * the cluster.
+   * @throws Exception
+   */
+  @Test
+  public void testSimpleMasterFailover() throws Exception {
+
+    final int NUM_MASTERS = 3;
+    final int NUM_RS = 3;
+
+    // Start the cluster
+    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+    TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
+    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+
+    // get all the master threads
+    List<HMaster> HMasters = cluster.getMasters();
+
+    // make sure all masters came online
+    for (HMaster mt : HMasters) {
+      assertTrue(mt.isAlive());
+    }
+
+    // verify only one is the active master and we have right number
+    int numActive = 0;
+    int activeIndex = -1;
+    String activeName = null;
+    for (int i = 0; i < HMasters.size(); i++) {
+      if (HMasters.get(i).isActiveMaster()) {
+        numActive++;
+        activeIndex = i;
+        activeName = HMasters.get(i).getServerName();
+      }
+    }
+    assertEquals(1, numActive);
+    assertEquals(NUM_MASTERS, HMasters.size());
+
+    // attempt to stop one of the inactive masters
+    int backupIndex = (activeIndex + 1) % HMasters.size();
+    LOG.debug("\n\nStopping backup master (#" + backupIndex + ")\n");
+    cluster.killMaster(backupIndex);
+    cluster.waitOnMasterStop(backupIndex);
+
+    // verify still one active master and it's the same
+    for (int i = 0; i < HMasters.size(); i++) {
+      if (HMasters.get(i).isActiveMaster()) {
+        assertTrue(activeName.equals(
+            HMasters.get(i).getServerName()));
+        activeIndex = i;
+      }
+    }
+    assertTrue(activeIndex != -1);
+    assertEquals(1, numActive);
+    assertEquals(2, HMasters.size());
+
+    // kill the active master
+    LOG.debug("\n\nStopping the active master (#" + activeIndex + ")\n");
+    cluster.killMaster(activeIndex);
+    cluster.waitOnMasterStop(activeIndex);
+
+    // wait for an active master to show up and be ready
+    assertTrue(cluster.waitForActiveAndReadyMaster());
+
+    LOG.debug("\n\nVerifying backup master is now active\n");
+    // should only have one master now
+    assertEquals(1, HMasters.size());
+    // and he should be active
+    assertTrue(HMasters.get(0).isActiveMaster());
+
+    // Stop the cluster
+    TEST_UTIL.shutdownMiniCluster();
+  }
+
+}