You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by to...@apache.org on 2012/04/04 21:21:02 UTC

svn commit: r1309554 - in /hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common: ./ src/main/java/org/apache/hadoop/ha/ src/test/java/org/apache/hadoop/ha/

Author: todd
Date: Wed Apr  4 19:21:01 2012
New Revision: 1309554

URL: http://svn.apache.org/viewvc?rev=1309554&view=rev
Log:
HADOOP-8245. Fix flakiness in TestZKFailoverController. Contributed by Todd Lipcon.

Added:
    hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java
Modified:
    hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt
    hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
    hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java
    hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java
    hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java
    hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java
    hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java

Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt?rev=1309554&r1=1309553&r2=1309554&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt (original)
+++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt Wed Apr  4 19:21:01 2012
@@ -9,3 +9,5 @@ HADOOP-8220. ZKFailoverController doesn'
 HADOOP-8228. Auto HA: Refactor tests and add stress tests. (todd)
 
 HADOOP-8215. Security support for ZK Failover controller (todd)
+
+HADOOP-8245. Fix flakiness in TestZKFailoverController (todd)

Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java?rev=1309554&r1=1309553&r2=1309554&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java (original)
+++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java Wed Apr  4 19:21:01 2012
@@ -240,8 +240,6 @@ public class ActiveStandbyElector implem
   public synchronized void joinElection(byte[] data)
       throws HadoopIllegalArgumentException {
     
-    LOG.debug("Attempting active election");
-
     if (data == null) {
       throw new HadoopIllegalArgumentException("data cannot be null");
     }
@@ -249,6 +247,7 @@ public class ActiveStandbyElector implem
     appData = new byte[data.length];
     System.arraycopy(data, 0, appData, 0, data.length);
 
+    LOG.debug("Attempting active election for " + this);
     joinElectionInternal();
   }
   
@@ -272,6 +271,9 @@ public class ActiveStandbyElector implem
    */
   public synchronized void ensureParentZNode()
       throws IOException, InterruptedException {
+    Preconditions.checkState(!wantToBeInElection,
+        "ensureParentZNode() may not be called while in the election");
+
     String pathParts[] = znodeWorkingDir.split("/");
     Preconditions.checkArgument(pathParts.length >= 1 &&
         "".equals(pathParts[0]),
@@ -305,6 +307,9 @@ public class ActiveStandbyElector implem
    */
   public synchronized void clearParentZNode()
       throws IOException, InterruptedException {
+    Preconditions.checkState(!wantToBeInElection,
+        "clearParentZNode() may not be called while in the election");
+
     try {
       LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK...");
 
@@ -393,7 +398,8 @@ public class ActiveStandbyElector implem
       String name) {
     if (isStaleClient(ctx)) return;
     LOG.debug("CreateNode result: " + rc + " for path: " + path
-        + " connectionState: " + zkConnectionState);
+        + " connectionState: " + zkConnectionState +
+        "  for " + this);
 
     Code code = Code.get(rc);
     if (isSuccess(code)) {
@@ -449,8 +455,13 @@ public class ActiveStandbyElector implem
   public synchronized void processResult(int rc, String path, Object ctx,
       Stat stat) {
     if (isStaleClient(ctx)) return;
+    
+    assert wantToBeInElection :
+        "Got a StatNode result after quitting election";
+    
     LOG.debug("StatNode result: " + rc + " for path: " + path
-        + " connectionState: " + zkConnectionState);
+        + " connectionState: " + zkConnectionState + " for " + this);
+        
 
     Code code = Code.get(rc);
     if (isSuccess(code)) {
@@ -517,7 +528,8 @@ public class ActiveStandbyElector implem
     if (isStaleClient(zk)) return;
     LOG.debug("Watcher event type: " + eventType + " with state:"
         + event.getState() + " for path:" + event.getPath()
-        + " connectionState: " + zkConnectionState);
+        + " connectionState: " + zkConnectionState
+        + " for " + this);
 
     if (eventType == Event.EventType.None) {
       // the connection state has changed
@@ -528,7 +540,8 @@ public class ActiveStandbyElector implem
         // be undone
         ConnectionState prevConnectionState = zkConnectionState;
         zkConnectionState = ConnectionState.CONNECTED;
-        if (prevConnectionState == ConnectionState.DISCONNECTED) {
+        if (prevConnectionState == ConnectionState.DISCONNECTED &&
+            wantToBeInElection) {
           monitorActiveStatus();
         }
         break;
@@ -600,12 +613,14 @@ public class ActiveStandbyElector implem
   }
 
   private void fatalError(String errorMessage) {
+    LOG.fatal(errorMessage);
     reset();
     appClient.notifyFatalError(errorMessage);
   }
 
   private void monitorActiveStatus() {
-    LOG.debug("Monitoring active leader");
+    assert wantToBeInElection;
+    LOG.debug("Monitoring active leader for " + this);
     statRetryCount = 0;
     monitorLockNodeAsync();
   }
@@ -688,7 +703,7 @@ public class ActiveStandbyElector implem
     int connectionRetryCount = 0;
     boolean success = false;
     while(!success && connectionRetryCount < NUM_RETRIES) {
-      LOG.debug("Establishing zookeeper connection");
+      LOG.debug("Establishing zookeeper connection for " + this);
       try {
         createConnection();
         success = true;
@@ -703,13 +718,14 @@ public class ActiveStandbyElector implem
 
   private void createConnection() throws IOException {
     zkClient = getNewZooKeeper();
+    LOG.debug("Created new connection for " + this);
   }
   
   private void terminateConnection() {
     if (zkClient == null) {
       return;
     }
-    LOG.debug("Terminating ZK connection");
+    LOG.debug("Terminating ZK connection for " + this);
     ZooKeeper tempZk = zkClient;
     zkClient = null;
     try {
@@ -735,7 +751,7 @@ public class ActiveStandbyElector implem
       Stat oldBreadcrumbStat = fenceOldActive();
       writeBreadCrumbNode(oldBreadcrumbStat);
       
-      LOG.debug("Becoming active");
+      LOG.debug("Becoming active for " + this);
       appClient.becomeActive();
       state = State.ACTIVE;
       return true;
@@ -838,7 +854,7 @@ public class ActiveStandbyElector implem
 
   private void becomeStandby() {
     if (state != State.STANDBY) {
-      LOG.debug("Becoming standby");
+      LOG.debug("Becoming standby for " + this);
       state = State.STANDBY;
       appClient.becomeStandby();
     }
@@ -846,7 +862,7 @@ public class ActiveStandbyElector implem
 
   private void enterNeutralMode() {
     if (state != State.NEUTRAL) {
-      LOG.debug("Entering neutral mode");
+      LOG.debug("Entering neutral mode for " + this);
       state = State.NEUTRAL;
       appClient.enterNeutralMode();
     }
@@ -943,8 +959,14 @@ public class ActiveStandbyElector implem
 
     @Override
     public void process(WatchedEvent event) {
-      ActiveStandbyElector.this.processWatchEvent(
-          zk, event);
+      try {
+        ActiveStandbyElector.this.processWatchEvent(
+            zk, event);
+      } catch (Throwable t) {
+        fatalError(
+            "Failed to process watcher event " + event + ": " +
+            StringUtils.stringifyException(t));
+      }
     }
   }
 
@@ -972,5 +994,13 @@ public class ActiveStandbyElector implem
     }
     return false;
   }
+  
+  @Override
+  public String toString() {
+    return "elector id=" + System.identityHashCode(this) +
+      " appData=" +
+      ((appData == null) ? "null" : StringUtils.byteToHexString(appData)) + 
+      " cb=" + appClient;
+  }
 
 }

Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java?rev=1309554&r1=1309553&r2=1309554&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java (original)
+++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java Wed Apr  4 19:21:01 2012
@@ -154,6 +154,7 @@ public abstract class ZKFailoverControll
     try {
       mainLoop();
     } finally {
+      elector.quitElection(true);
       healthMonitor.shutdown();
       healthMonitor.join();
     }
@@ -379,6 +380,11 @@ public abstract class ZKFailoverControll
         throw new RuntimeException("Unable to fence " + target);
       }
     }
+    
+    @Override
+    public String toString() {
+      return "Elector callbacks for " + localTarget;
+    }
   }
   
   /**

Added: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java?rev=1309554&view=auto
==============================================================================
--- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java (added)
+++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java Wed Apr  4 19:21:01 2012
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import java.io.File;
+import java.util.Set;
+
+import javax.management.ObjectName;
+
+import org.apache.zookeeper.test.ClientBase;
+import org.apache.zookeeper.test.JMXEnv;
+
+/**
+ * A subclass of ZK's ClientBase testing utility, with some fixes
+ * necessary for running in the Hadoop context.
+ */
+public class ClientBaseWithFixes extends ClientBase {
+
+  /**
+   * When running on the Jenkins setup, we need to ensure that this
+   * build directory exists before running the tests.
+   */
+  @Override
+  public void setUp() throws Exception {
+    // build.test.dir is used by zookeeper
+    new File(System.getProperty("build.test.dir", "build")).mkdirs();
+    super.setUp();
+  }
+  
+  /**
+   * ZK seems to have a bug when we muck with its sessions
+   * behind its back, causing disconnects, etc. This bug
+   * ends up leaving JMX beans around at the end of the test,
+   * and ClientBase's teardown method will throw an exception
+   * if it finds JMX beans leaked. So, clear them out there
+   * to workaround the ZK bug. See ZOOKEEPER-1438.
+   */
+  @Override
+  public void tearDown() throws Exception {
+    Set<ObjectName> names = JMXEnv.ensureAll();
+    for (ObjectName n : names) {
+      try {
+        JMXEnv.conn().unregisterMBean(n);
+      } catch (Throwable t) {
+        // ignore
+      }
+    }
+  }
+}

Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java?rev=1309554&r1=1309553&r2=1309554&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java (original)
+++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java Wed Apr  4 19:21:01 2012
@@ -389,6 +389,7 @@ public class TestActiveStandbyElector {
    */
   @Test
   public void testStatNodeRetry() {
+    elector.joinElection(data);
     elector.processResult(Code.CONNECTIONLOSS.intValue(), ZK_LOCK_NAME, mockZK,
         (Stat) null);
     elector.processResult(Code.CONNECTIONLOSS.intValue(), ZK_LOCK_NAME, mockZK,
@@ -409,6 +410,7 @@ public class TestActiveStandbyElector {
    */
   @Test
   public void testStatNodeError() {
+    elector.joinElection(data);
     elector.processResult(Code.RUNTIMEINCONSISTENCY.intValue(), ZK_LOCK_NAME,
         mockZK, (Stat) null);
     Mockito.verify(mockApp, Mockito.times(0)).enterNeutralMode();
@@ -592,6 +594,8 @@ public class TestActiveStandbyElector {
    */
   @Test
   public void testQuitElection() throws Exception {
+    elector.joinElection(data);
+    Mockito.verify(mockZK, Mockito.times(0)).close();
     elector.quitElection(true);
     Mockito.verify(mockZK, Mockito.times(1)).close();
     // no watches added

Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java?rev=1309554&r1=1309553&r2=1309554&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java (original)
+++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java Wed Apr  4 19:21:01 2012
@@ -21,7 +21,6 @@ package org.apache.hadoop.ha;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
-import java.io.File;
 import java.util.Collections;
 import java.util.UUID;
 
@@ -32,7 +31,6 @@ import org.apache.hadoop.ha.HAZKUtil.ZKA
 import org.apache.log4j.Level;
 import org.apache.zookeeper.ZooDefs.Ids;
 import org.apache.zookeeper.server.ZooKeeperServer;
-import org.apache.zookeeper.test.ClientBase;
 import org.junit.Test;
 import org.mockito.AdditionalMatchers;
 import org.mockito.Mockito;
@@ -42,7 +40,7 @@ import com.google.common.primitives.Ints
 /**
  * Test for {@link ActiveStandbyElector} using real zookeeper.
  */
-public class TestActiveStandbyElectorRealZK extends ClientBase {
+public class TestActiveStandbyElectorRealZK extends ClientBaseWithFixes {
   static final int NUM_ELECTORS = 2;
   
   static {
@@ -61,8 +59,6 @@ public class TestActiveStandbyElectorRea
   
   @Override
   public void setUp() throws Exception {
-    // build.test.dir is used by zookeeper
-    new File(System.getProperty("build.test.dir", "build")).mkdirs();
     super.setUp();
     
     zkServer = getServer(serverFactory);
@@ -244,4 +240,19 @@ public class TestActiveStandbyElectorRea
     checkFatalsAndReset();
   }
 
+  @Test(timeout=15000)
+  public void testDontJoinElectionOnDisconnectAndReconnect() throws Exception {
+    electors[0].ensureParentZNode();
+
+    stopServer();
+    ActiveStandbyElectorTestUtil.waitForElectorState(
+        null, electors[0], State.NEUTRAL);
+    startServer();
+    waitForServerUp(hostPort, CONNECTION_TIMEOUT);
+    // Have to sleep to allow time for the clients to reconnect.
+    Thread.sleep(2000);
+    Mockito.verify(cbs[0], Mockito.never()).becomeActive();
+    Mockito.verify(cbs[1], Mockito.never()).becomeActive();
+    checkFatalsAndReset();
+  }
 }

Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java?rev=1309554&r1=1309553&r2=1309554&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java (original)
+++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java Wed Apr  4 19:21:01 2012
@@ -19,7 +19,6 @@ package org.apache.hadoop.ha;
 
 import static org.junit.Assert.*;
 
-import java.io.File;
 import java.security.NoSuchAlgorithmException;
 
 import org.apache.commons.logging.impl.Log4JLogger;
@@ -32,12 +31,11 @@ import org.apache.zookeeper.KeeperExcept
 import org.apache.zookeeper.ZooKeeper;
 import org.apache.zookeeper.data.Stat;
 import org.apache.zookeeper.server.auth.DigestAuthenticationProvider;
-import org.apache.zookeeper.test.ClientBase;
 import org.junit.Before;
 import org.junit.Test;
 import org.mockito.Mockito;
 
-public class TestZKFailoverController extends ClientBase {
+public class TestZKFailoverController extends ClientBaseWithFixes {
   private Configuration conf;
   private MiniZKFCCluster cluster;
   
@@ -63,13 +61,6 @@ public class TestZKFailoverController ex
     ((Log4JLogger)ActiveStandbyElector.LOG).getLogger().setLevel(Level.ALL);
   }
   
-  @Override
-  public void setUp() throws Exception {
-    // build.test.dir is used by zookeeper
-    new File(System.getProperty("build.test.dir", "build")).mkdirs();
-    super.setUp();
-  }
-  
   @Before
   public void setupConfAndServices() {
     conf = new Configuration();

Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java?rev=1309554&r1=1309553&r2=1309554&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java (original)
+++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java Wed Apr  4 19:21:01 2012
@@ -17,15 +17,10 @@
  */
 package org.apache.hadoop.ha;
 
-import java.io.File;
 import java.util.Random;
-import java.util.Set;
 
-import javax.management.ObjectName;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.zookeeper.test.ClientBase;
-import org.apache.zookeeper.test.JMXEnv;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -39,7 +34,7 @@ import org.mockito.stubbing.Answer;
  * failovers. While doing so, ensures that a fake "shared resource"
  * (simulating the shared edits dir) is only owned by one service at a time. 
  */
-public class TestZKFailoverControllerStress extends ClientBase {
+public class TestZKFailoverControllerStress extends ClientBaseWithFixes {
   
   private static final int STRESS_RUNTIME_SECS = 30;
   private static final int EXTRA_TIMEOUT_SECS = 10;
@@ -47,13 +42,6 @@ public class TestZKFailoverControllerStr
   private Configuration conf;
   private MiniZKFCCluster cluster;
 
-  @Override
-  public void setUp() throws Exception {
-    // build.test.dir is used by zookeeper
-    new File(System.getProperty("build.test.dir", "build")).mkdirs();
-    super.setUp();
-  }
-  
   @Before
   public void setupConfAndServices() throws Exception {
     conf = new Configuration();
@@ -68,22 +56,6 @@ public class TestZKFailoverControllerStr
   }
 
   /**
-   * ZK seems to have a bug when we muck with its sessions
-   * behind its back, causing disconnects, etc. This bug
-   * ends up leaving JMX beans around at the end of the test,
-   * and ClientBase's teardown method will throw an exception
-   * if it finds JMX beans leaked. So, clear them out there
-   * to workaround the ZK bug. See ZOOKEEPER-1438.
-   */
-  @After
-  public void clearZKJMX() throws Exception {
-    Set<ObjectName> names = JMXEnv.ensureAll();
-    for (ObjectName n : names) {
-      JMXEnv.conn().unregisterMBean(n);
-    }
-  }
-
-  /**
    * Simply fail back and forth between two services for the
    * configured amount of time, via expiring their ZK sessions.
    */