You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by te...@apache.org on 2011/12/30 07:36:54 UTC

svn commit: r1225764 - in /hbase/branches/0.92: CHANGES.txt src/main/java/org/apache/hadoop/hbase/master/HMaster.java src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java

Author: tedyu
Date: Fri Dec 30 06:36:54 2011
New Revision: 1225764

URL: http://svn.apache.org/viewvc?rev=1225764&view=rev
Log:
HBASE-5099  ZK event thread waiting for root region assignment may block server
               shutdown handler for the region server the root region was on (Jimmy)

Added:
    hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java
Modified:
    hbase/branches/0.92/CHANGES.txt
    hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Modified: hbase/branches/0.92/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/CHANGES.txt?rev=1225764&r1=1225763&r2=1225764&view=diff
==============================================================================
--- hbase/branches/0.92/CHANGES.txt (original)
+++ hbase/branches/0.92/CHANGES.txt Fri Dec 30 06:36:54 2011
@@ -501,6 +501,8 @@ Release 0.92.0 - Unreleased
    HBASE-5077  SplitLogWorker fails to let go of a task, kills the RS
    HBASE-5096  Replication does not handle deletes correctly. (Lars H)
    HBASE-5103  Fix improper master znode deserialization (Jonathan Hsieh)
+   HBASE-5099  ZK event thread waiting for root region assignment may block server
+               shutdown handler for the region server the root region was on (Jimmy)
 
   TESTS
    HBASE-4492  TestRollingRestart fails intermittently

Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1225764&r1=1225763&r2=1225764&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Fri Dec 30 06:36:54 2011
@@ -27,8 +27,13 @@ import java.net.InetSocketAddress;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
-import java.util.concurrent.atomic.AtomicReference;
 import java.util.Set;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -56,7 +61,6 @@ import org.apache.hadoop.hbase.client.Re
 import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
 import org.apache.hadoop.hbase.executor.ExecutorService;
 import org.apache.hadoop.hbase.executor.ExecutorService.ExecutorType;
-import org.apache.hadoop.hbase.io.hfile.CacheConfig;
 import org.apache.hadoop.hbase.ipc.HBaseRPC;
 import org.apache.hadoop.hbase.ipc.HBaseServer;
 import org.apache.hadoop.hbase.ipc.HMasterInterface;
@@ -72,7 +76,6 @@ import org.apache.hadoop.hbase.master.ha
 import org.apache.hadoop.hbase.master.handler.TableDeleteFamilyHandler;
 import org.apache.hadoop.hbase.master.handler.TableModifyFamilyHandler;
 import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
-import org.apache.hadoop.hbase.master.RegionPlan;
 import org.apache.hadoop.hbase.monitoring.MemoryBoundedLogMessageBuffer;
 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
@@ -88,8 +91,8 @@ import org.apache.hadoop.hbase.util.Thre
 import org.apache.hadoop.hbase.util.VersionInfo;
 import org.apache.hadoop.hbase.zookeeper.ClusterId;
 import org.apache.hadoop.hbase.zookeeper.ClusterStatusTracker;
-import org.apache.hadoop.hbase.zookeeper.RegionServerTracker;
 import org.apache.hadoop.hbase.zookeeper.DrainingServerTracker;
+import org.apache.hadoop.hbase.zookeeper.RegionServerTracker;
 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
 import org.apache.hadoop.io.MapWritable;
 import org.apache.hadoop.io.Text;
@@ -1235,7 +1238,9 @@ implements HMasterInterface, HMasterRegi
   }
 
   /**
-   * We do the following.
+   * We do the following in a different thread.  If it is not completed
+   * in time, we will time it out and assume it is not easy to recover.
+   *
    * 1. Create a new ZK session. (since our current one is expired)
    * 2. Try to become a primary master again
    * 3. Initialize all ZK based system trackers.
@@ -1246,29 +1251,53 @@ implements HMasterInterface, HMasterRegi
    * @return True if we could successfully recover from ZK session expiry.
    * @throws InterruptedException
    * @throws IOException
+   * @throws KeeperException
+   * @throws ExecutionException
    */
   private boolean tryRecoveringExpiredZKSession() throws InterruptedException,
-      IOException, KeeperException {
+      IOException, KeeperException, ExecutionException {
+
     this.zooKeeper = new ZooKeeperWatcher(conf, MASTER + ":"
-        + this.serverName.getPort(), this, true);
+      + this.serverName.getPort(), this, true);
 
-    MonitoredTask status = 
-      TaskMonitor.get().createStatus("Recovering expired ZK session");
-    try {
-      if (!becomeActiveMaster(status)) {
-        return false;
+    Callable<Boolean> callable = new Callable<Boolean> () {
+      public Boolean call() throws InterruptedException,
+          IOException, KeeperException {
+        MonitoredTask status =
+          TaskMonitor.get().createStatus("Recovering expired ZK session");
+        try {
+          if (!becomeActiveMaster(status)) {
+            return Boolean.FALSE;
+          }
+          initializeZKBasedSystemTrackers();
+          // Update in-memory structures to reflect our earlier Root/Meta assignment.
+          assignRootAndMeta(status);
+          // process RIT if any
+          // TODO: Why does this not call AssignmentManager.joinCluster?  Otherwise
+          // we are not processing dead servers if any.
+          assignmentManager.processDeadServersAndRegionsInTransition();
+          return Boolean.TRUE;
+        } finally {
+          status.cleanup();
+        }
+      }
+    };
+
+    long timeout =
+      conf.getLong("hbase.master.zksession.recover.timeout", 300000);
+    java.util.concurrent.ExecutorService executor =
+      Executors.newSingleThreadExecutor();
+    Future<Boolean> result = executor.submit(callable);
+    executor.shutdown();
+    if (executor.awaitTermination(timeout, TimeUnit.MILLISECONDS)
+        && result.isDone()) {
+      Boolean recovered = result.get();
+      if (recovered != null) {
+        return recovered.booleanValue();
       }
-      initializeZKBasedSystemTrackers();
-      // Update in-memory structures to reflect our earlier Root/Meta assignment.
-      assignRootAndMeta(status);
-      // process RIT if any
-      // TODO: Why does this not call AssignmentManager.joinCluster?  Otherwise
-      // we are not processing dead servers if any.
-      this.assignmentManager.processDeadServersAndRegionsInTransition();
-      return true;
-    } finally {
-      status.cleanup();
     }
+    executor.shutdownNow();
+    return false;
   }
 
   /**

Added: hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java?rev=1225764&view=auto
==============================================================================
--- hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java (added)
+++ hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java Fri Dec 30 06:36:54 2011
@@ -0,0 +1,93 @@
+/**
+ * Copyright The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.MiniHBaseCluster;
+import org.apache.zookeeper.KeeperException;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test cases for master recovery from ZK session expiry (failure and success).
+ */
+public class TestMasterZKSessionRecovery {
+  private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+
+  /**
+   * Lowers hbase.master.zksession.recover.timeout from its 5 minute default
+   * to 50000 ms (50 seconds) so that the test won't wait for too long.
+   */
+  static {
+    Configuration conf = TEST_UTIL.getConfiguration();
+    conf.setLong("hbase.master.zksession.recover.timeout", 50000);
+  }
+
+  @Before
+  public void setUp() throws Exception {
+    // Start a fresh mini cluster with one regionserver before every test.
+    TEST_UTIL.startMiniCluster(1);
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    TEST_UTIL.shutdownMiniCluster();
+  }
+
+  /**
+   * Negative test of master recovery from zk session expiry.
+   * <p>
+   * Starts with one master. Fakes zk session expiry by aborting the master
+   * with a SessionExpiredException without closing its zk watcher first.
+   * Expects the master to be stopped, since the master zk node is still there.
+   * @throws Exception if any step of the test fails unexpectedly
+   */
+  @Test(timeout=10000)
+  public void testMasterZKSessionRecoveryFailure() throws Exception {
+    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+    HMaster m = cluster.getMaster();
+    m.abort("Test recovery from zk session expired",
+      new KeeperException.SessionExpiredException());
+    assertTrue(m.isStopped());
+  }
+
+  /**
+   * Positive test of master recovery from zk session expiry.
+   * <p>
+   * Starts with one master. Closes the master's ZooKeeperWatcher, then aborts
+   * the master with a SessionExpiredException; expects it not to be stopped.
+   * @throws Exception if any step of the test fails unexpectedly
+   */
+  @Test(timeout=60000)
+  public void testMasterZKSessionRecoverySuccess() throws Exception {
+    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+    HMaster m = cluster.getMaster();
+    m.getZooKeeperWatcher().close();
+    m.abort("Test recovery from zk session expired",
+      new KeeperException.SessionExpiredException());
+    assertFalse(m.isStopped());
+  }
+}
+