You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by to...@apache.org on 2012/04/04 21:21:02 UTC
svn commit: r1309554 - in
/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common: ./
src/main/java/org/apache/hadoop/ha/ src/test/java/org/apache/hadoop/ha/
Author: todd
Date: Wed Apr 4 19:21:01 2012
New Revision: 1309554
URL: http://svn.apache.org/viewvc?rev=1309554&view=rev
Log:
HADOOP-8245. Fix flakiness in TestZKFailoverController. Contributed by Todd Lipcon.
Added:
hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java
Modified:
hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt
hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java
hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java
hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java
hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java
hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java
Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt?rev=1309554&r1=1309553&r2=1309554&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt (original)
+++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt Wed Apr 4 19:21:01 2012
@@ -9,3 +9,5 @@ HADOOP-8220. ZKFailoverController doesn'
HADOOP-8228. Auto HA: Refactor tests and add stress tests. (todd)
HADOOP-8215. Security support for ZK Failover controller (todd)
+
+HADOOP-8245. Fix flakiness in TestZKFailoverController (todd)
Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java?rev=1309554&r1=1309553&r2=1309554&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java (original)
+++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java Wed Apr 4 19:21:01 2012
@@ -240,8 +240,6 @@ public class ActiveStandbyElector implem
public synchronized void joinElection(byte[] data)
throws HadoopIllegalArgumentException {
- LOG.debug("Attempting active election");
-
if (data == null) {
throw new HadoopIllegalArgumentException("data cannot be null");
}
@@ -249,6 +247,7 @@ public class ActiveStandbyElector implem
appData = new byte[data.length];
System.arraycopy(data, 0, appData, 0, data.length);
+ LOG.debug("Attempting active election for " + this);
joinElectionInternal();
}
@@ -272,6 +271,9 @@ public class ActiveStandbyElector implem
*/
public synchronized void ensureParentZNode()
throws IOException, InterruptedException {
+ Preconditions.checkState(!wantToBeInElection,
+ "ensureParentZNode() may not be called while in the election");
+
String pathParts[] = znodeWorkingDir.split("/");
Preconditions.checkArgument(pathParts.length >= 1 &&
"".equals(pathParts[0]),
@@ -305,6 +307,9 @@ public class ActiveStandbyElector implem
*/
public synchronized void clearParentZNode()
throws IOException, InterruptedException {
+ Preconditions.checkState(!wantToBeInElection,
+ "clearParentZNode() may not be called while in the election");
+
try {
LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK...");
@@ -393,7 +398,8 @@ public class ActiveStandbyElector implem
String name) {
if (isStaleClient(ctx)) return;
LOG.debug("CreateNode result: " + rc + " for path: " + path
- + " connectionState: " + zkConnectionState);
+ + " connectionState: " + zkConnectionState +
+ " for " + this);
Code code = Code.get(rc);
if (isSuccess(code)) {
@@ -449,8 +455,13 @@ public class ActiveStandbyElector implem
public synchronized void processResult(int rc, String path, Object ctx,
Stat stat) {
if (isStaleClient(ctx)) return;
+
+ assert wantToBeInElection :
+ "Got a StatNode result after quitting election";
+
LOG.debug("StatNode result: " + rc + " for path: " + path
- + " connectionState: " + zkConnectionState);
+ + " connectionState: " + zkConnectionState + " for " + this);
+
Code code = Code.get(rc);
if (isSuccess(code)) {
@@ -517,7 +528,8 @@ public class ActiveStandbyElector implem
if (isStaleClient(zk)) return;
LOG.debug("Watcher event type: " + eventType + " with state:"
+ event.getState() + " for path:" + event.getPath()
- + " connectionState: " + zkConnectionState);
+ + " connectionState: " + zkConnectionState
+ + " for " + this);
if (eventType == Event.EventType.None) {
// the connection state has changed
@@ -528,7 +540,8 @@ public class ActiveStandbyElector implem
// be undone
ConnectionState prevConnectionState = zkConnectionState;
zkConnectionState = ConnectionState.CONNECTED;
- if (prevConnectionState == ConnectionState.DISCONNECTED) {
+ if (prevConnectionState == ConnectionState.DISCONNECTED &&
+ wantToBeInElection) {
monitorActiveStatus();
}
break;
@@ -600,12 +613,14 @@ public class ActiveStandbyElector implem
}
private void fatalError(String errorMessage) {
+ LOG.fatal(errorMessage);
reset();
appClient.notifyFatalError(errorMessage);
}
private void monitorActiveStatus() {
- LOG.debug("Monitoring active leader");
+ assert wantToBeInElection;
+ LOG.debug("Monitoring active leader for " + this);
statRetryCount = 0;
monitorLockNodeAsync();
}
@@ -688,7 +703,7 @@ public class ActiveStandbyElector implem
int connectionRetryCount = 0;
boolean success = false;
while(!success && connectionRetryCount < NUM_RETRIES) {
- LOG.debug("Establishing zookeeper connection");
+ LOG.debug("Establishing zookeeper connection for " + this);
try {
createConnection();
success = true;
@@ -703,13 +718,14 @@ public class ActiveStandbyElector implem
private void createConnection() throws IOException {
zkClient = getNewZooKeeper();
+ LOG.debug("Created new connection for " + this);
}
private void terminateConnection() {
if (zkClient == null) {
return;
}
- LOG.debug("Terminating ZK connection");
+ LOG.debug("Terminating ZK connection for " + this);
ZooKeeper tempZk = zkClient;
zkClient = null;
try {
@@ -735,7 +751,7 @@ public class ActiveStandbyElector implem
Stat oldBreadcrumbStat = fenceOldActive();
writeBreadCrumbNode(oldBreadcrumbStat);
- LOG.debug("Becoming active");
+ LOG.debug("Becoming active for " + this);
appClient.becomeActive();
state = State.ACTIVE;
return true;
@@ -838,7 +854,7 @@ public class ActiveStandbyElector implem
private void becomeStandby() {
if (state != State.STANDBY) {
- LOG.debug("Becoming standby");
+ LOG.debug("Becoming standby for " + this);
state = State.STANDBY;
appClient.becomeStandby();
}
@@ -846,7 +862,7 @@ public class ActiveStandbyElector implem
private void enterNeutralMode() {
if (state != State.NEUTRAL) {
- LOG.debug("Entering neutral mode");
+ LOG.debug("Entering neutral mode for " + this);
state = State.NEUTRAL;
appClient.enterNeutralMode();
}
@@ -943,8 +959,14 @@ public class ActiveStandbyElector implem
@Override
public void process(WatchedEvent event) {
- ActiveStandbyElector.this.processWatchEvent(
- zk, event);
+ try {
+ ActiveStandbyElector.this.processWatchEvent(
+ zk, event);
+ } catch (Throwable t) {
+ fatalError(
+ "Failed to process watcher event " + event + ": " +
+ StringUtils.stringifyException(t));
+ }
}
}
@@ -972,5 +994,13 @@ public class ActiveStandbyElector implem
}
return false;
}
+
+ @Override
+ public String toString() {
+ return "elector id=" + System.identityHashCode(this) +
+ " appData=" +
+ ((appData == null) ? "null" : StringUtils.byteToHexString(appData)) +
+ " cb=" + appClient;
+ }
}
Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java?rev=1309554&r1=1309553&r2=1309554&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java (original)
+++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java Wed Apr 4 19:21:01 2012
@@ -154,6 +154,7 @@ public abstract class ZKFailoverControll
try {
mainLoop();
} finally {
+ elector.quitElection(true);
healthMonitor.shutdown();
healthMonitor.join();
}
@@ -379,6 +380,11 @@ public abstract class ZKFailoverControll
throw new RuntimeException("Unable to fence " + target);
}
}
+
+ @Override
+ public String toString() {
+ return "Elector callbacks for " + localTarget;
+ }
}
/**
Added: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java?rev=1309554&view=auto
==============================================================================
--- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java (added)
+++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java Wed Apr 4 19:21:01 2012
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import java.io.File;
+import java.util.Set;
+
+import javax.management.ObjectName;
+
+import org.apache.zookeeper.test.ClientBase;
+import org.apache.zookeeper.test.JMXEnv;
+
+/**
+ * A subclass of ZK's ClientBase testing utility, with some fixes
+ * necessary for running in the Hadoop context.
+ */
+public class ClientBaseWithFixes extends ClientBase {
+
+ /**
+ * When running on the Jenkins setup, we need to ensure that this
+ * build directory exists before running the tests.
+ */
+ @Override
+ public void setUp() throws Exception {
+ // build.test.dir is used by zookeeper
+ new File(System.getProperty("build.test.dir", "build")).mkdirs();
+ super.setUp();
+ }
+
+ /**
+ * ZK seems to have a bug when we muck with its sessions
+ * behind its back, causing disconnects, etc. This bug
+ * ends up leaving JMX beans around at the end of the test,
+ * and ClientBase's teardown method will throw an exception
+ * if it finds JMX beans leaked. So, clear them out there
+ * to workaround the ZK bug. See ZOOKEEPER-1438.
+ */
+ @Override
+ public void tearDown() throws Exception {
+ Set<ObjectName> names = JMXEnv.ensureAll();
+ for (ObjectName n : names) {
+ try {
+ JMXEnv.conn().unregisterMBean(n);
+ } catch (Throwable t) {
+ // ignore
+ }
+ }
+ }
+}
Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java?rev=1309554&r1=1309553&r2=1309554&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java (original)
+++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java Wed Apr 4 19:21:01 2012
@@ -389,6 +389,7 @@ public class TestActiveStandbyElector {
*/
@Test
public void testStatNodeRetry() {
+ elector.joinElection(data);
elector.processResult(Code.CONNECTIONLOSS.intValue(), ZK_LOCK_NAME, mockZK,
(Stat) null);
elector.processResult(Code.CONNECTIONLOSS.intValue(), ZK_LOCK_NAME, mockZK,
@@ -409,6 +410,7 @@ public class TestActiveStandbyElector {
*/
@Test
public void testStatNodeError() {
+ elector.joinElection(data);
elector.processResult(Code.RUNTIMEINCONSISTENCY.intValue(), ZK_LOCK_NAME,
mockZK, (Stat) null);
Mockito.verify(mockApp, Mockito.times(0)).enterNeutralMode();
@@ -592,6 +594,8 @@ public class TestActiveStandbyElector {
*/
@Test
public void testQuitElection() throws Exception {
+ elector.joinElection(data);
+ Mockito.verify(mockZK, Mockito.times(0)).close();
elector.quitElection(true);
Mockito.verify(mockZK, Mockito.times(1)).close();
// no watches added
Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java?rev=1309554&r1=1309553&r2=1309554&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java (original)
+++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java Wed Apr 4 19:21:01 2012
@@ -21,7 +21,6 @@ package org.apache.hadoop.ha;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
-import java.io.File;
import java.util.Collections;
import java.util.UUID;
@@ -32,7 +31,6 @@ import org.apache.hadoop.ha.HAZKUtil.ZKA
import org.apache.log4j.Level;
import org.apache.zookeeper.ZooDefs.Ids;
import org.apache.zookeeper.server.ZooKeeperServer;
-import org.apache.zookeeper.test.ClientBase;
import org.junit.Test;
import org.mockito.AdditionalMatchers;
import org.mockito.Mockito;
@@ -42,7 +40,7 @@ import com.google.common.primitives.Ints
/**
* Test for {@link ActiveStandbyElector} using real zookeeper.
*/
-public class TestActiveStandbyElectorRealZK extends ClientBase {
+public class TestActiveStandbyElectorRealZK extends ClientBaseWithFixes {
static final int NUM_ELECTORS = 2;
static {
@@ -61,8 +59,6 @@ public class TestActiveStandbyElectorRea
@Override
public void setUp() throws Exception {
- // build.test.dir is used by zookeeper
- new File(System.getProperty("build.test.dir", "build")).mkdirs();
super.setUp();
zkServer = getServer(serverFactory);
@@ -244,4 +240,19 @@ public class TestActiveStandbyElectorRea
checkFatalsAndReset();
}
+ @Test(timeout=15000)
+ public void testDontJoinElectionOnDisconnectAndReconnect() throws Exception {
+ electors[0].ensureParentZNode();
+
+ stopServer();
+ ActiveStandbyElectorTestUtil.waitForElectorState(
+ null, electors[0], State.NEUTRAL);
+ startServer();
+ waitForServerUp(hostPort, CONNECTION_TIMEOUT);
+ // Have to sleep to allow time for the clients to reconnect.
+ Thread.sleep(2000);
+ Mockito.verify(cbs[0], Mockito.never()).becomeActive();
+ Mockito.verify(cbs[1], Mockito.never()).becomeActive();
+ checkFatalsAndReset();
+ }
}
Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java?rev=1309554&r1=1309553&r2=1309554&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java (original)
+++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java Wed Apr 4 19:21:01 2012
@@ -19,7 +19,6 @@ package org.apache.hadoop.ha;
import static org.junit.Assert.*;
-import java.io.File;
import java.security.NoSuchAlgorithmException;
import org.apache.commons.logging.impl.Log4JLogger;
@@ -32,12 +31,11 @@ import org.apache.zookeeper.KeeperExcept
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.data.Stat;
import org.apache.zookeeper.server.auth.DigestAuthenticationProvider;
-import org.apache.zookeeper.test.ClientBase;
import org.junit.Before;
import org.junit.Test;
import org.mockito.Mockito;
-public class TestZKFailoverController extends ClientBase {
+public class TestZKFailoverController extends ClientBaseWithFixes {
private Configuration conf;
private MiniZKFCCluster cluster;
@@ -63,13 +61,6 @@ public class TestZKFailoverController ex
((Log4JLogger)ActiveStandbyElector.LOG).getLogger().setLevel(Level.ALL);
}
- @Override
- public void setUp() throws Exception {
- // build.test.dir is used by zookeeper
- new File(System.getProperty("build.test.dir", "build")).mkdirs();
- super.setUp();
- }
-
@Before
public void setupConfAndServices() {
conf = new Configuration();
Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java?rev=1309554&r1=1309553&r2=1309554&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java (original)
+++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java Wed Apr 4 19:21:01 2012
@@ -17,15 +17,10 @@
*/
package org.apache.hadoop.ha;
-import java.io.File;
import java.util.Random;
-import java.util.Set;
-import javax.management.ObjectName;
import org.apache.hadoop.conf.Configuration;
-import org.apache.zookeeper.test.ClientBase;
-import org.apache.zookeeper.test.JMXEnv;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@@ -39,7 +34,7 @@ import org.mockito.stubbing.Answer;
* failovers. While doing so, ensures that a fake "shared resource"
* (simulating the shared edits dir) is only owned by one service at a time.
*/
-public class TestZKFailoverControllerStress extends ClientBase {
+public class TestZKFailoverControllerStress extends ClientBaseWithFixes {
private static final int STRESS_RUNTIME_SECS = 30;
private static final int EXTRA_TIMEOUT_SECS = 10;
@@ -47,13 +42,6 @@ public class TestZKFailoverControllerStr
private Configuration conf;
private MiniZKFCCluster cluster;
- @Override
- public void setUp() throws Exception {
- // build.test.dir is used by zookeeper
- new File(System.getProperty("build.test.dir", "build")).mkdirs();
- super.setUp();
- }
-
@Before
public void setupConfAndServices() throws Exception {
conf = new Configuration();
@@ -68,22 +56,6 @@ public class TestZKFailoverControllerStr
}
/**
- * ZK seems to have a bug when we muck with its sessions
- * behind its back, causing disconnects, etc. This bug
- * ends up leaving JMX beans around at the end of the test,
- * and ClientBase's teardown method will throw an exception
- * if it finds JMX beans leaked. So, clear them out there
- * to workaround the ZK bug. See ZOOKEEPER-1438.
- */
- @After
- public void clearZKJMX() throws Exception {
- Set<ObjectName> names = JMXEnv.ensureAll();
- for (ObjectName n : names) {
- JMXEnv.conn().unregisterMBean(n);
- }
- }
-
- /**
* Simply fail back and forth between two services for the
* configured amount of time, via expiring their ZK sessions.
*/