You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ra...@apache.org on 2012/06/05 18:40:47 UTC

svn commit: r1346461 - in /hbase/branches/0.94/src: main/java/org/apache/hadoop/hbase/master/ test/java/org/apache/hadoop/hbase/master/

Author: ramkrishna
Date: Tue Jun  5 16:40:47 2012
New Revision: 1346461

URL: http://svn.apache.org/viewvc?rev=1346461&view=rev
Log:
HBASE-6046 Master retry on ZK session expiry causes inconsistent region assignments. (Ashutosh)

Modified:
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
    hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java
    hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1346461&r1=1346460&r2=1346461&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Tue Jun  5 16:40:47 2012
@@ -341,7 +341,7 @@ Server {
 
       // We are either the active master or we were asked to shutdown
       if (!this.stopped) {
-        finishInitialization(startupStatus);
+        finishInitialization(startupStatus, false);
         loop();
       }
     } catch (Throwable t) {
@@ -459,12 +459,13 @@ Server {
    * <li>Ensure assignment of root and meta regions<li>
    * <li>Handle either fresh cluster start or master failover</li>
    * </ol>
+   * @param masterRecovery 
    *
    * @throws IOException
    * @throws InterruptedException
    * @throws KeeperException
    */
-  private void finishInitialization(MonitoredTask status)
+  private void finishInitialization(MonitoredTask status, boolean masterRecovery)
   throws IOException, InterruptedException, KeeperException {
 
     isActiveMaster = true;
@@ -478,7 +479,7 @@ Server {
     status.setStatus("Initializing Master file system");
     this.masterActiveTime = System.currentTimeMillis();
     // TODO: Do this using Dependency Injection, using PicoContainer, Guice or Spring.
-    this.fileSystemManager = new MasterFileSystem(this, this, metrics);
+    this.fileSystemManager = new MasterFileSystem(this, this, metrics, masterRecovery);
 
     this.tableDescriptors =
       new FSTableDescriptors(this.fileSystemManager.getFileSystem(),
@@ -487,21 +488,24 @@ Server {
     // publish cluster ID
     status.setStatus("Publishing Cluster ID in ZooKeeper");
     ClusterId.setClusterId(this.zooKeeper, fileSystemManager.getClusterId());
+    if (!masterRecovery) {
+      this.executorService = new ExecutorService(getServerName().toString());
+      this.serverManager = new ServerManager(this, this);
+    }
 
-    this.executorService = new ExecutorService(getServerName().toString());
-
-    this.serverManager = new ServerManager(this, this);
 
     status.setStatus("Initializing ZK system trackers");
     initializeZKBasedSystemTrackers();
-
-    // initialize master side coprocessors before we start handling requests
-    status.setStatus("Initializing master coprocessors");
-    this.cpHost = new MasterCoprocessorHost(this, this.conf);
-
-    // start up all service threads.
-    status.setStatus("Initializing master service threads");
-    startServiceThreads();
+    
+    if (!masterRecovery) {
+      // initialize master side coprocessors before we start handling requests
+      status.setStatus("Initializing master coprocessors");
+      this.cpHost = new MasterCoprocessorHost(this, this.conf);
+
+      // start up all service threads.
+      status.setStatus("Initializing master service threads");
+      startServiceThreads();
+    }
 
     // Wait for region servers to report in.
     this.serverManager.waitForRegionServers(status);
@@ -514,8 +518,9 @@ Server {
         this.serverManager.recordNewServer(sn, HServerLoad.EMPTY_HSERVERLOAD);
       }
     }
-
-    this.assignmentManager.startTimeOutMonitor();
+    if (!masterRecovery) {
+      this.assignmentManager.startTimeOutMonitor();
+    }
     // TODO: Should do this in background rather than block master startup
     status.setStatus("Splitting logs after master startup");
     splitLogAfterStartup(this.fileSystemManager);
@@ -543,13 +548,15 @@ Server {
     status.setStatus("Fixing up missing daughters");
     fixupDaughters(status);
 
-    // Start balancer and meta catalog janitor after meta and regions have
-    // been assigned.
-    status.setStatus("Starting balancer and catalog janitor");
-    this.balancerChore = getAndStartBalancerChore(this);
-    this.catalogJanitorChore = new CatalogJanitor(this, this);
-    startCatalogJanitorChore();
-    registerMBean();
+    if (!masterRecovery) {
+      // Start balancer and meta catalog janitor after meta and regions have
+      // been assigned.
+      status.setStatus("Starting balancer and catalog janitor");
+      this.balancerChore = getAndStartBalancerChore(this);
+      this.catalogJanitorChore = new CatalogJanitor(this, this);
+      startCatalogJanitorChore();
+      registerMBean();
+    }
 
     status.markComplete("Initialization successful");
     LOG.info("Master has completed initialization");
@@ -560,12 +567,14 @@ Server {
     // master initialization. See HBASE-5916.
     this.serverManager.clearDeadServersWithSameHostNameAndPortOfOnlineServer();
     
-    if (this.cpHost != null) {
-      // don't let cp initialization errors kill the master
-      try {
-        this.cpHost.postStartMaster();
-      } catch (IOException ioe) {
-        LOG.error("Coprocessor postStartMaster() hook failed", ioe);
+    if (!masterRecovery) {
+      if (this.cpHost != null) {
+        // don't let cp initialization errors kill the master
+        try {
+          this.cpHost.postStartMaster();
+        } catch (IOException ioe) {
+          LOG.error("Coprocessor postStartMaster() hook failed", ioe);
+        }
       }
     }
   }
@@ -1433,13 +1442,9 @@ Server {
           if (!becomeActiveMaster(status)) {
             return Boolean.FALSE;
           }
-          initializeZKBasedSystemTrackers();
-          // Update in-memory structures to reflect our earlier Root/Meta assignment.
-          assignRootAndMeta(status);
-          // process RIT if any
-          // TODO: Why does this not call AssignmentManager.joinCluster?  Otherwise
-          // we are not processing dead servers if any.
-          assignmentManager.processDeadServersAndRegionsInTransition();
+          serverShutdownHandlerEnabled = false;
+          initialized = false;
+          finishInitialization(status, true);
           return Boolean.TRUE;
         } finally {
           status.cleanup();

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java?rev=1346461&r1=1346460&r2=1346461&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java Tue Jun  5 16:40:47 2012
@@ -81,7 +81,7 @@ public class MasterFileSystem {
   private final MasterServices services;
 
   public MasterFileSystem(Server master, MasterServices services,
-      MasterMetrics metrics)
+      MasterMetrics metrics, boolean masterRecovery)
   throws IOException {
     this.conf = master.getConfiguration();
     this.master = master;
@@ -103,7 +103,7 @@ public class MasterFileSystem {
     if (this.distributedLogSplitting) {
       this.splitLogManager = new SplitLogManager(master.getZooKeeper(),
           master.getConfiguration(), master, master.getServerName().toString());
-      this.splitLogManager.finishInitialization();
+      this.splitLogManager.finishInitialization(masterRecovery);
     } else {
       this.splitLogManager = null;
     }

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java?rev=1346461&r1=1346460&r2=1346461&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java Tue Jun  5 16:40:47 2012
@@ -182,9 +182,11 @@ public class SplitLogManager extends Zoo
         stopper);
   }
 
-  public void finishInitialization() {
-    Threads.setDaemonThreadRunning(timeoutMonitor.getThread(), serverName +
-      ".splitLogManagerTimeoutMonitor");
+  public void finishInitialization(boolean masterRecovery) {
+    if (!masterRecovery) {
+      Threads.setDaemonThreadRunning(timeoutMonitor.getThread(), serverName
+          + ".splitLogManagerTimeoutMonitor");
+    }
     // Watcher can be null during tests with Mock'd servers.
     if (this.watcher != null) {
       this.watcher.registerListener(this);
@@ -1196,4 +1198,11 @@ public class SplitLogManager extends Zoo
       return statusMsg;
     }
   }
+  
+  /**
+   * Completes the initialization
+   */
+  public void finishInitialization() {
+    finishInitialization(false);
+  }
 }

Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java?rev=1346461&r1=1346460&r2=1346461&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java Tue Jun  5 16:40:47 2012
@@ -148,7 +148,7 @@ public class TestCatalogJanitor {
     private final AssignmentManager asm;
 
     MockMasterServices(final Server server) throws IOException {
-      this.mfs = new MasterFileSystem(server, this, null);
+      this.mfs = new MasterFileSystem(server, this, null, false);
       this.asm = Mockito.mock(AssignmentManager.class);
     }
 

Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java?rev=1346461&r1=1346460&r2=1346461&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java Tue Jun  5 16:40:47 2012
@@ -19,13 +19,31 @@
  */
 package org.apache.hadoop.hbase.master;
 
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HColumnDescriptor;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.HTableDescriptor;
 import org.apache.hadoop.hbase.MediumTests;
 import org.apache.hadoop.hbase.MiniHBaseCluster;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.client.HBaseAdmin;
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.ResultScanner;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.zookeeper.ZKAssign;
+import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
 import org.apache.zookeeper.KeeperException;
 import org.junit.After;
 import org.junit.Before;
@@ -46,6 +64,8 @@ public class TestMasterZKSessionRecovery
   static {
     Configuration conf = TEST_UTIL.getConfiguration();
     conf.setLong("hbase.master.zksession.recover.timeout", 50000);
+    conf.setClass(HConstants.HBASE_MASTER_LOADBALANCER_CLASS,
+        MockLoadBalancer.class, LoadBalancer.class);
   }
 
   @Before
@@ -92,5 +112,93 @@ public class TestMasterZKSessionRecovery
       new KeeperException.SessionExpiredException());
     assertFalse(m.isStopped());
   }
+  
+  /**
+   * Tests that the master does not call retainAssignment after recovery from
+   * expired zookeeper session. Without the HBASE-6046 fix master always tries
+   * to assign all the user regions by calling retainAssignment.
+   */
+  @Test
+  public void testRegionAssignmentAfterMasterRecoveryDueToZKExpiry() throws Exception {
+    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+    cluster.startRegionServer();
+    HMaster m = cluster.getMaster();
+    // now the cluster is up. So assign some regions.
+    HBaseAdmin admin = new HBaseAdmin(TEST_UTIL.getConfiguration());
+    byte[][] SPLIT_KEYS = new byte[][] { Bytes.toBytes("a"), Bytes.toBytes("b"),
+        Bytes.toBytes("c"), Bytes.toBytes("d"), Bytes.toBytes("e"), Bytes.toBytes("f"),
+        Bytes.toBytes("g"), Bytes.toBytes("h"), Bytes.toBytes("i"), Bytes.toBytes("j") };
+
+    String tableName = "testRegionAssignmentAfterMasterRecoveryDueToZKExpiry";
+    admin.createTable(new HTableDescriptor(tableName), SPLIT_KEYS);
+    ZooKeeperWatcher zooKeeperWatcher = HBaseTestingUtility.getZooKeeperWatcher(TEST_UTIL);
+    ZKAssign.blockUntilNoRIT(zooKeeperWatcher);
+    m.getZooKeeperWatcher().close();
+    MockLoadBalancer.retainAssignCalled = false;
+    m.abort("Test recovery from zk session expired", new KeeperException.SessionExpiredException());
+    assertFalse(m.isStopped());
+    // The recovered master should not call retainAssignment, as it is not a
+    // clean startup.
+    assertFalse("Retain assignment should not be called", MockLoadBalancer.retainAssignCalled);
+  }
+
+  static class MockLoadBalancer extends DefaultLoadBalancer {
+    static boolean retainAssignCalled = false;
+
+    @Override
+    public Map<ServerName, List<HRegionInfo>> retainAssignment(
+        Map<HRegionInfo, ServerName> regions, List<ServerName> servers) {
+      retainAssignCalled = true;
+      return super.retainAssignment(regions, servers);
+    }
+  }
+
+  /**
+   * Tests whether the logs are split when master recovers from a expired
+   * zookeeper session and an RS goes down.
+   */
+  @Test(timeout = 60000)
+  public void testLogSplittingAfterMasterRecoveryDueToZKExpiry() throws IOException,
+      KeeperException, InterruptedException {
+    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+    cluster.startRegionServer();
+    HMaster m = cluster.getMaster();
+    // now the cluster is up. So assign some regions.
+    HBaseAdmin admin = new HBaseAdmin(TEST_UTIL.getConfiguration());
+    byte[][] SPLIT_KEYS = new byte[][] { Bytes.toBytes("1"), Bytes.toBytes("2"),
+        Bytes.toBytes("3"), Bytes.toBytes("4"), Bytes.toBytes("5") };
+
+    String tableName = "testLogSplittingAfterMasterRecoveryDueToZKExpiry";
+    HTableDescriptor htd = new HTableDescriptor(tableName);
+    HColumnDescriptor hcd = new HColumnDescriptor("col");
+    htd.addFamily(hcd);
+    admin.createTable(htd, SPLIT_KEYS);
+    ZooKeeperWatcher zooKeeperWatcher = HBaseTestingUtility.getZooKeeperWatcher(TEST_UTIL);
+    ZKAssign.blockUntilNoRIT(zooKeeperWatcher);
+    HTable table = new HTable(TEST_UTIL.getConfiguration(), tableName);
+
+    Put p = null;
+    int numberOfPuts = 0;
+    for (numberOfPuts = 0; numberOfPuts < 6; numberOfPuts++) {
+      p = new Put(Bytes.toBytes(numberOfPuts));
+      p.add(Bytes.toBytes("col"), Bytes.toBytes("ql"), Bytes.toBytes("value" + numberOfPuts));
+      table.put(p);
+    }
+    m.getZooKeeperWatcher().close();
+    m.abort("Test recovery from zk session expired", new KeeperException.SessionExpiredException());
+    assertFalse(m.isStopped());
+    cluster.getRegionServer(0).abort("Aborting");
+    // Without patch for HBASE-6046 this test case will always timeout
+    // with patch the test case should pass.
+    Scan scan = new Scan();
+    int numberOfRows = 0;
+    ResultScanner scanner = table.getScanner(scan);
+    Result[] result = scanner.next(1);
+    while (result != null && result.length > 0) {
+      numberOfRows++;
+      result = scanner.next(1);
+    }
+    assertEquals("Number of rows should be equal to number of puts.", numberOfPuts, numberOfRows);
+  }
 }