You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by te...@apache.org on 2011/12/30 07:36:54 UTC
svn commit: r1225764 - in /hbase/branches/0.92: CHANGES.txt
src/main/java/org/apache/hadoop/hbase/master/HMaster.java
src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java
Author: tedyu
Date: Fri Dec 30 06:36:54 2011
New Revision: 1225764
URL: http://svn.apache.org/viewvc?rev=1225764&view=rev
Log:
HBASE-5099 ZK event thread waiting for root region assignment may block server
shutdown handler for the region server the root region was on (Jimmy)
Added:
hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java
Modified:
hbase/branches/0.92/CHANGES.txt
hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
Modified: hbase/branches/0.92/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/CHANGES.txt?rev=1225764&r1=1225763&r2=1225764&view=diff
==============================================================================
--- hbase/branches/0.92/CHANGES.txt (original)
+++ hbase/branches/0.92/CHANGES.txt Fri Dec 30 06:36:54 2011
@@ -501,6 +501,8 @@ Release 0.92.0 - Unreleased
HBASE-5077 SplitLogWorker fails to let go of a task, kills the RS
HBASE-5096 Replication does not handle deletes correctly. (Lars H)
HBASE-5103 Fix improper master znode deserialization (Jonathan Hsieh)
+ HBASE-5099 ZK event thread waiting for root region assignment may block server
+ shutdown handler for the region server the root region was on (Jimmy)
TESTS
HBASE-4492 TestRollingRestart fails intermittently
Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1225764&r1=1225763&r2=1225764&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Fri Dec 30 06:36:54 2011
@@ -27,8 +27,13 @@ import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
-import java.util.concurrent.atomic.AtomicReference;
import java.util.Set;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -56,7 +61,6 @@ import org.apache.hadoop.hbase.client.Re
import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
import org.apache.hadoop.hbase.executor.ExecutorService;
import org.apache.hadoop.hbase.executor.ExecutorService.ExecutorType;
-import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.ipc.HBaseRPC;
import org.apache.hadoop.hbase.ipc.HBaseServer;
import org.apache.hadoop.hbase.ipc.HMasterInterface;
@@ -72,7 +76,6 @@ import org.apache.hadoop.hbase.master.ha
import org.apache.hadoop.hbase.master.handler.TableDeleteFamilyHandler;
import org.apache.hadoop.hbase.master.handler.TableModifyFamilyHandler;
import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
-import org.apache.hadoop.hbase.master.RegionPlan;
import org.apache.hadoop.hbase.monitoring.MemoryBoundedLogMessageBuffer;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
@@ -88,8 +91,8 @@ import org.apache.hadoop.hbase.util.Thre
import org.apache.hadoop.hbase.util.VersionInfo;
import org.apache.hadoop.hbase.zookeeper.ClusterId;
import org.apache.hadoop.hbase.zookeeper.ClusterStatusTracker;
-import org.apache.hadoop.hbase.zookeeper.RegionServerTracker;
import org.apache.hadoop.hbase.zookeeper.DrainingServerTracker;
+import org.apache.hadoop.hbase.zookeeper.RegionServerTracker;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
@@ -1235,7 +1238,9 @@ implements HMasterInterface, HMasterRegi
}
/**
- * We do the following.
+ * We do the following in a different thread. If it is not completed
+ * in time, we will time it out and assume it is not easy to recover.
+ *
* 1. Create a new ZK session. (since our current one is expired)
* 2. Try to become a primary master again
* 3. Initialize all ZK based system trackers.
@@ -1246,29 +1251,53 @@ implements HMasterInterface, HMasterRegi
* @return True if we could successfully recover from ZK session expiry.
* @throws InterruptedException
* @throws IOException
+ * @throws KeeperException
+ * @throws ExecutionException
*/
private boolean tryRecoveringExpiredZKSession() throws InterruptedException,
- IOException, KeeperException {
+ IOException, KeeperException, ExecutionException {
+
this.zooKeeper = new ZooKeeperWatcher(conf, MASTER + ":"
- + this.serverName.getPort(), this, true);
+ + this.serverName.getPort(), this, true);
- MonitoredTask status =
- TaskMonitor.get().createStatus("Recovering expired ZK session");
- try {
- if (!becomeActiveMaster(status)) {
- return false;
+ Callable<Boolean> callable = new Callable<Boolean> () {
+ public Boolean call() throws InterruptedException,
+ IOException, KeeperException {
+ MonitoredTask status =
+ TaskMonitor.get().createStatus("Recovering expired ZK session");
+ try {
+ if (!becomeActiveMaster(status)) {
+ return Boolean.FALSE;
+ }
+ initializeZKBasedSystemTrackers();
+ // Update in-memory structures to reflect our earlier Root/Meta assignment.
+ assignRootAndMeta(status);
+ // process RIT if any
+ // TODO: Why does this not call AssignmentManager.joinCluster? Otherwise
+ // we are not processing dead servers if any.
+ assignmentManager.processDeadServersAndRegionsInTransition();
+ return Boolean.TRUE;
+ } finally {
+ status.cleanup();
+ }
+ }
+ };
+
+ long timeout =
+ conf.getLong("hbase.master.zksession.recover.timeout", 300000);
+ java.util.concurrent.ExecutorService executor =
+ Executors.newSingleThreadExecutor();
+ Future<Boolean> result = executor.submit(callable);
+ executor.shutdown();
+ if (executor.awaitTermination(timeout, TimeUnit.MILLISECONDS)
+ && result.isDone()) {
+ Boolean recovered = result.get();
+ if (recovered != null) {
+ return recovered.booleanValue();
}
- initializeZKBasedSystemTrackers();
- // Update in-memory structures to reflect our earlier Root/Meta assignment.
- assignRootAndMeta(status);
- // process RIT if any
- // TODO: Why does this not call AssignmentManager.joinCluster? Otherwise
- // we are not processing dead servers if any.
- this.assignmentManager.processDeadServersAndRegionsInTransition();
- return true;
- } finally {
- status.cleanup();
}
+ executor.shutdownNow();
+ return false;
}
/**
Added: hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java?rev=1225764&view=auto
==============================================================================
--- hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java (added)
+++ hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java Fri Dec 30 06:36:54 2011
@@ -0,0 +1,93 @@
+/**
+ * Copyright The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.MiniHBaseCluster;
+import org.apache.zookeeper.KeeperException;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test cases for master to recover from ZK session expiry.
+ */
+public class TestMasterZKSessionRecovery {
+ private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+
+ /**
+ * The default timeout is 5 minutes.
+ * Shorten it so that the test won't wait for too long.
+ */
+ static {
+ Configuration conf = TEST_UTIL.getConfiguration();
+ conf.setLong("hbase.master.zksession.recover.timeout", 50000);
+ }
+
+ @Before
+ public void setUp() throws Exception {
+ // Start a cluster of one regionserver.
+ TEST_UTIL.startMiniCluster(1);
+ }
+
+ @After
+ public void tearDown() throws Exception {
+ TEST_UTIL.shutdownMiniCluster();
+ }
+
+ /**
+ * Negative test of master recovery from zk session expiry.
+ * <p>
+ * Starts with one master. Fakes expiry of the master's zk session.
+ * Ensures the master cannot recover the expired zk session since
+ * the master zk node is still there.
+ * @throws Exception
+ */
+ @Test(timeout=10000)
+ public void testMasterZKSessionRecoveryFailure() throws Exception {
+ MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+ HMaster m = cluster.getMaster();
+ m.abort("Test recovery from zk session expired",
+ new KeeperException.SessionExpiredException());
+ assertTrue(m.isStopped());
+ }
+
+ /**
+ * Positive test of master recovery from zk session expiry.
+ * <p>
+ * Starts with one master. Closes the master zk session.
+ * Ensures the master can recover the expired zk session.
+ * @throws Exception
+ */
+ @Test(timeout=60000)
+ public void testMasterZKSessionRecoverySuccess() throws Exception {
+ MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+ HMaster m = cluster.getMaster();
+ m.getZooKeeperWatcher().close();
+ m.abort("Test recovery from zk session expired",
+ new KeeperException.SessionExpiredException());
+ assertFalse(m.isStopped());
+ }
+}
+