You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2012/01/04 19:28:20 UTC

svn commit: r1227255 - in /lucene/dev/branches/solrcloud/solr: core/src/java/org/apache/solr/client/solrj/embedded/ core/src/java/org/apache/solr/cloud/ core/src/java/org/apache/solr/update/ core/src/test/org/apache/solr/cloud/ solrj/src/java/org/apach...

Author: markrmiller
Date: Wed Jan  4 18:28:20 2012
New Revision: 1227255

URL: http://svn.apache.org/viewvc?rev=1227255&view=rev
Log:
harden leader election against connection loss - also add infra for doing connection loss and expiration testing with ChaosMonkey

Added:
    lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkCmdExecutor.java   (with props)
    lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZooKeeperOperation.java   (with props)
Modified:
    lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java
    lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
    lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
    lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java
    lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java
    lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySolrCloudTest.java
    lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullSolrCloudTest.java
    lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java
    lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/RecoveryZkTest.java
    lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkTestServer.java
    lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java
    lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/zookeeper/SolrZooKeeper.java

Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java Wed Jan  4 18:28:20 2012
@@ -34,6 +34,7 @@ import org.mortbay.jetty.servlet.Context
 import org.mortbay.jetty.servlet.FilterHolder;
 import org.mortbay.jetty.servlet.HashSessionIdManager;
 import org.mortbay.log.Logger;
+import org.mortbay.thread.QueuedThreadPool;
 
 /**
  * Run solr using jetty
@@ -94,8 +95,22 @@ public class JettySolrRunner {
       SocketConnector connector = new SocketConnector();
       connector.setPort(port);
       connector.setReuseAddress(true);
+      QueuedThreadPool threadPool = (QueuedThreadPool) connector.getThreadPool();
+      if (threadPool != null) {
+        threadPool.setMaxStopTimeMs(100);
+      }
       server.setConnectors(new Connector[] { connector });
       server.setSessionIdManager(new HashSessionIdManager(new Random()));
+    } else {
+      for (Connector connector : server.getConnectors()) {
+        if (connector instanceof SocketConnector) {
+          QueuedThreadPool threadPool = (QueuedThreadPool) ((SocketConnector) connector)
+              .getThreadPool();
+          if (threadPool != null) {
+            threadPool.setMaxStopTimeMs(100);
+          }
+        }
+      }
     }
 
     // Initialize the servlets

Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java Wed Jan  4 18:28:20 2012
@@ -44,19 +44,28 @@ public abstract class ElectionContext {
 final class ShardLeaderElectionContext extends ElectionContext {
   
   private final SolrZkClient zkClient;
+  private ZkCmdExecutor proto;
   public ShardLeaderElectionContext(final String shardId,
       final String collection, final String shardZkNodeName, ZkNodeProps props, SolrZkClient zkClient) {
     super(shardZkNodeName, ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/leader_elect/"
         + shardId, ZkStateReader.getShardLeadersPath(collection, shardId),
         props);
     this.zkClient = zkClient;
+    this.proto = new ZkCmdExecutor(zkClient);
   }
 
   @Override
   void runLeaderProcess() throws KeeperException, InterruptedException {
-    String currentLeaderZkPath = leaderPath;
-    zkClient.makePath(currentLeaderZkPath, leaderProps == null ? null
-        : ZkStateReader.toJSON(leaderProps), CreateMode.EPHEMERAL);
+    proto.retryOperation(new ZooKeeperOperation() {
+      
+      @Override
+      public Object execute() throws KeeperException, InterruptedException {
+        zkClient.makePath(leaderPath, leaderProps == null ? null
+            : ZkStateReader.toJSON(leaderProps), CreateMode.EPHEMERAL);
+        return null;
+      }
+    });
+
   }
 }
 

Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java Wed Jan  4 18:28:20 2012
@@ -31,6 +31,7 @@ import org.apache.solr.common.cloud.Solr
 import org.apache.solr.common.cloud.ZooKeeperException;
 import org.apache.zookeeper.CreateMode;
 import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.KeeperException.ConnectionLossException;
 import org.apache.zookeeper.KeeperException.NodeExistsException;
 import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
@@ -57,12 +58,16 @@ public  class LeaderElector {
   
   private static final String ELECTION_NODE = "/election";
   
-  private final static Pattern LEADER_SEQ = Pattern.compile(".*?/?n_(\\d+)");
+  private final static Pattern LEADER_SEQ = Pattern.compile(".*?/?.*?-n_(\\d+)");
+  private final static Pattern SESSION_ID = Pattern.compile(".*?/?(.*?)-n_\\d+");
+  
+  private ZkCmdExecutor cmdExecutor;
   
   protected SolrZkClient zkClient;
   
   public LeaderElector(SolrZkClient zkClient) {
     this.zkClient = zkClient;
+    cmdExecutor = new ZkCmdExecutor(zkClient);
   }
   
   /**
@@ -81,8 +86,16 @@ public  class LeaderElector {
   private void checkIfIamLeader(final int seq, final ElectionContext context) throws KeeperException,
       InterruptedException, IOException {
     // get all other numbers...
-    String holdElectionPath = context.electionPath + ELECTION_NODE;
-    List<String> seqs = zkClient.getChildren(holdElectionPath, null);
+    final String holdElectionPath = context.electionPath + ELECTION_NODE;
+    List<String> seqs = cmdExecutor.retryOperation(new ZooKeeperOperation() {
+      
+      @Override
+      public Object execute() throws KeeperException, InterruptedException {
+         return zkClient.getChildren(holdElectionPath, null);
+      }
+    });
+    
+    
     sortSeqs(seqs);
     List<Integer> intSeqs = getSeqs(seqs);
     if (seq <= intSeqs.get(0)) {
@@ -151,6 +164,18 @@ public  class LeaderElector {
     return seq;
   }
   
+  private long getSessionId(String nStringSequence) {
+    long id = 0;
+    Matcher m = SESSION_ID.matcher(nStringSequence);
+    if (m.matches()) {
+      id = Long.parseLong(m.group(1));
+    } else {
+      throw new IllegalStateException("Could not find regex match in:"
+          + nStringSequence);
+    }
+    return id;
+  }
+  
   /**
    * Returns int list given list of form n_0000000001, n_0000000003, etc.
    * 
@@ -182,15 +207,38 @@ public  class LeaderElector {
   public int joinElection(ElectionContext context) throws KeeperException, InterruptedException, IOException {
     final String shardsElectZkPath = context.electionPath + LeaderElector.ELECTION_NODE;
     
+    long id = zkClient.getSolrZooKeeper().getSessionId();
     String leaderSeqPath = null;
     boolean cont = true;
     int tries = 0;
     while (cont) {
       try {
-        
-        leaderSeqPath = zkClient.create(shardsElectZkPath + "/n_", null,
+        leaderSeqPath = zkClient.create(shardsElectZkPath + "/" + id + "-n_", null,
             CreateMode.EPHEMERAL_SEQUENTIAL);
         cont = false;
+      } catch (ConnectionLossException e) {
+        // we don't know if we made our node or not...
+        List<String> entries = cmdExecutor.retryOperation(new ZooKeeperOperation() {
+          
+          @Override
+          public Object execute() throws KeeperException, InterruptedException {
+             return zkClient.getChildren(shardsElectZkPath, null);
+          }
+        });
+        
+        boolean foundId = false;
+        for (String entry : entries) {
+          long nodeId = getSessionId(entry);
+          if (id == nodeId) {
+            // we did create our node...
+            foundId  = true;
+            break;
+          }
+        }
+        if (!foundId) {
+          throw e;
+        }
+
       } catch (KeeperException.NoNodeException e) {
         // we must have failed in creating the election node - someone else must
         // be working on it, lets try again
@@ -224,7 +272,7 @@ public  class LeaderElector {
     try {
       
       // leader election node
-      if (!zkClient.exists(electZKPath)) {
+      if (!zkClient.exists(electZKPath)) { // on connection loss we throw out an exception
         
         // make new leader election node
         zkClient.makePath(electZKPath, CreateMode.PERSISTENT, null);

Added: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkCmdExecutor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkCmdExecutor.java?rev=1227255&view=auto
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkCmdExecutor.java (added)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkCmdExecutor.java Wed Jan  4 18:28:20 2012
@@ -0,0 +1,153 @@
+package org.apache.solr.cloud;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.List;
+
+import org.apache.log4j.Logger;
+import org.apache.solr.common.cloud.SolrZkClient;
+import org.apache.zookeeper.CreateMode;
+import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.ZooDefs;
+import org.apache.zookeeper.data.ACL;
+
+
+public class ZkCmdExecutor {
+  private static final Logger LOG = Logger.getLogger(ZkCmdExecutor.class);
+  
+  protected final SolrZkClient solrZkClient;
+  private long retryDelay = 1000L;
+  private int retryCount = 15;
+  private List<ACL> acl = ZooDefs.Ids.OPEN_ACL_UNSAFE;
+  
+  public ZkCmdExecutor(SolrZkClient solrZkClient) {
+    this.solrZkClient = solrZkClient;
+  }
+  
+  /**
+   * Returns the ACL this executor is using
+   * 
+   * @return the acl.
+   */
+  public List<ACL> getAcl() {
+    return acl;
+  }
+  
+  /**
+   * set the acl
+   * 
+   * @param acl
+   *          the acl to set to
+   */
+  public void setAcl(List<ACL> acl) {
+    this.acl = acl;
+  }
+  
+  /**
+   * get the retry delay in milliseconds
+   * 
+   * @return the retry delay
+   */
+  public long getRetryDelay() {
+    return retryDelay;
+  }
+  
+  /**
+   * Sets the base delay in milliseconds that is waited between retries
+   * 
+   * @param retryDelay
+   *          the retry delay
+   */
+  public void setRetryDelay(long retryDelay) {
+    this.retryDelay = retryDelay;
+  }
+  
+  /**
+   * Perform the given operation, retrying if the connection fails
+   * 
+   * @return the operation's result, unchecked-cast to the type the caller expects.
+   */
+  @SuppressWarnings("unchecked")
+  protected <T> T retryOperation(ZooKeeperOperation operation)
+      throws KeeperException, InterruptedException {
+    KeeperException exception = null;
+    for (int i = 0; i < retryCount; i++) {
+      try {
+        return (T) operation.execute();
+      } catch (KeeperException.ConnectionLossException e) {
+        if (exception == null) {
+          exception = e;
+        }
+        retryDelay(i);
+      }
+    }
+    throw exception;
+  }
+  
+  /**
+   * Ensures that the given path exists with no data, the current ACL and no
+   * flags
+   * 
+   * @param path
+   */
+  protected void ensurePathExists(String path) {
+    ensureExists(path, null, acl, CreateMode.PERSISTENT);
+  }
+  
+  /**
+   * Ensures that the given path exists with the given data, ACL and flags
+   * 
+   * @param path
+   * @param acl
+   * @param flags
+   */
+  protected void ensureExists(final String path, final byte[] data,
+      final List<ACL> acl, final CreateMode flags) {
+    try {
+      retryOperation(new ZooKeeperOperation() {
+        public Object execute() throws KeeperException, InterruptedException {
+          if (solrZkClient.exists(path)) {
+            return true;
+          }
+          solrZkClient.create(path, data, acl, flags);
+          return true;
+        }
+      });
+    } catch (KeeperException e) {
+      LOG.warn("", e);
+    } catch (InterruptedException e) {
+      LOG.warn("", e);
+    }
+  }
+  
+  /**
+   * Performs a retry delay if this is not the first attempt
+   * 
+   * @param attemptCount
+   *          the number of the attempts performed so far
+   */
+  protected void retryDelay(int attemptCount) {
+    if (attemptCount > 0) {
+      try {
+        Thread.sleep(attemptCount * retryDelay);
+      } catch (InterruptedException e) {
+        LOG.debug("Failed to sleep: " + e, e);
+      }
+    }
+  }
+}

Added: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZooKeeperOperation.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZooKeeperOperation.java?rev=1227255&view=auto
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZooKeeperOperation.java (added)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZooKeeperOperation.java Wed Jan  4 18:28:20 2012
@@ -0,0 +1,38 @@
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.cloud;
+
+import org.apache.zookeeper.KeeperException;
+
+/**
+ * A callback object which can be used for implementing retry-able operations in the 
+ * {@link org.apache.solr.cloud.ZkCmdExecutor} class
+ *
+ */
+public interface ZooKeeperOperation {
+    
+    /**
+     * Performs the operation - which may be invoked multiple times if the connection
+     * to ZooKeeper closes during this operation
+     *
+     * @return the result of the operation or null
+     * @throws KeeperException
+     * @throws InterruptedException
+     */
+    public Object execute() throws KeeperException, InterruptedException;
+}

Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java Wed Jan  4 18:28:20 2012
@@ -363,7 +363,7 @@ public class SolrCmdDistributor {
               response.errors.add(error);
               response.sreq = sreq;
               SolrException.logOnce(SolrCore.log, "shard update error "
-                  + sreq.node + " (" + sreq.node + ")", sreq.exception);
+                  + sreq.node, sreq.exception);
             }
           }
           
@@ -419,7 +419,7 @@ public class SolrCmdDistributor {
     
     @Override
     public String toString() {
-      return url;
+      return this.getClass().getSimpleName() + ": " + url;
     }
 
     @Override

Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java Wed Jan  4 18:28:20 2012
@@ -24,11 +24,14 @@ import java.util.Map;
 import java.util.Random;
 import java.util.concurrent.atomic.AtomicInteger;
 
+import org.apache.solr.client.solrj.SolrServer;
 import org.apache.solr.client.solrj.embedded.JettySolrRunner;
 import org.apache.solr.cloud.FullSolrCloudTest.CloudJettyRunner;
 import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.SolrZkClient;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
+import org.apache.solr.core.CoreContainer;
 import org.apache.solr.servlet.SolrDispatchFilter;
 import org.apache.zookeeper.KeeperException;
 import org.mortbay.jetty.servlet.FilterHolder;
@@ -43,6 +46,8 @@ import org.mortbay.jetty.servlet.FilterH
 public class ChaosMonkey {
 
   private static final boolean DONTKILLLEADER = true;
+  protected static final boolean EXPIRE_SESSIONS = false;
+  protected static final boolean CAUSE_CONNECTION_LOSS = false;
   private Map<String,List<CloudJettyRunner>> shardToJetty;
   private ZkTestServer zkServer;
   private ZkStateReader zkStateReader;
@@ -52,11 +57,15 @@ public class ChaosMonkey {
   private AtomicInteger stops = new AtomicInteger();
   private AtomicInteger starts = new AtomicInteger();
   private AtomicInteger expires = new AtomicInteger();
+  private AtomicInteger connloss = new AtomicInteger();
+  
+  private Map<String,List<SolrServer>> shardToClient;
   
   public ChaosMonkey(ZkTestServer zkServer, ZkStateReader zkStateReader,
       String collection, Map<String,List<CloudJettyRunner>> shardToJetty,
-      Random random) {
+      Map<String,List<SolrServer>> shardToClient, Random random) {
     this.shardToJetty = shardToJetty;
+    this.shardToClient = shardToClient;
     this.zkServer = zkServer;
     this.zkStateReader = zkStateReader;
     this.collection = collection;
@@ -70,17 +79,37 @@ public class ChaosMonkey {
   }
   
   public void expireRandomSession() throws KeeperException, InterruptedException {
-    Map<String,Slice> slices = zkStateReader.getCloudState().getSlices(collection);
-    List<String> sliceKeyList = new ArrayList<String>(slices.size());
-    sliceKeyList.addAll(slices.keySet());
-    String sliceName = sliceKeyList.get(random.nextInt(sliceKeyList.size()));
+    String sliceName = getRandomSlice();
     
-    JettySolrRunner jetty = getRandomSacraficialShard(sliceName, DONTKILLLEADER);
+    JettySolrRunner jetty = getRandomSacraficialJetty(sliceName, DONTKILLLEADER);
     if (jetty != null) {
       expireSession(jetty);
     }
   }
   
+  public void randomConnectionLoss() throws KeeperException, InterruptedException {
+    String sliceName = getRandomSlice();
+    
+    JettySolrRunner jetty = getRandomSacraficialJetty(sliceName, DONTKILLLEADER);
+    if (jetty != null) {
+      System.out.println("cause connection loss");
+      causeConnectionLoss(jetty);
+    }
+  }
+  
+  private void causeConnectionLoss(JettySolrRunner jetty) {
+    SolrDispatchFilter solrDispatchFilter = (SolrDispatchFilter) jetty
+        .getDispatchFilter().getFilter();
+    if (solrDispatchFilter != null) {
+      CoreContainer cores = solrDispatchFilter.getCores();
+      if (cores != null) {
+        SolrZkClient zkClient = cores.getZkController().getZkClient();
+        // over double tick time...
+        zkClient.getSolrZooKeeper().pauseCnxn(ZkTestServer.TICK_TIME * 3);
+      }
+    }
+  }
+
   public JettySolrRunner stopShard(String slice, int index) throws Exception {
     JettySolrRunner jetty = shardToJetty.get(slice).get(index).jetty;
     stopJetty(jetty);
@@ -163,18 +192,13 @@ public class ChaosMonkey {
   }
   
   public JettySolrRunner stopRandomShard() throws Exception {
-    // add all the shards to a list
-    Map<String,Slice> slices = zkStateReader.getCloudState().getSlices(collection);
-    
-    List<String> sliceKeyList = new ArrayList<String>(slices.size());
-    sliceKeyList.addAll(slices.keySet());
-    String sliceName = sliceKeyList.get(random.nextInt(sliceKeyList.size()));
+    String sliceName = getRandomSlice();
     
     return stopRandomShard(sliceName);
   }
   
   public JettySolrRunner stopRandomShard(String slice) throws Exception {
-    JettySolrRunner jetty = getRandomSacraficialShard(slice, DONTKILLLEADER);
+    JettySolrRunner jetty = getRandomSacraficialJetty(slice, DONTKILLLEADER);
     if (jetty != null) {
       stopJetty(jetty);
     }
@@ -184,24 +208,29 @@ public class ChaosMonkey {
   
   public JettySolrRunner killRandomShard() throws Exception {
     // add all the shards to a list
+    String sliceName = getRandomSlice();
+    
+    return killRandomShard(sliceName);
+  }
+
+  private String getRandomSlice() {
     Map<String,Slice> slices = zkStateReader.getCloudState().getSlices(collection);
     
     List<String> sliceKeyList = new ArrayList<String>(slices.size());
     sliceKeyList.addAll(slices.keySet());
     String sliceName = sliceKeyList.get(random.nextInt(sliceKeyList.size()));
-    
-    return killRandomShard(sliceName);
+    return sliceName;
   }
   
   public JettySolrRunner killRandomShard(String slice) throws Exception {
-    JettySolrRunner jetty = getRandomSacraficialShard(slice, DONTKILLLEADER);
+    JettySolrRunner jetty = getRandomSacraficialJetty(slice, DONTKILLLEADER);
     if (jetty != null) {
       killJetty(jetty);
     }
     return jetty;
   }
   
-  public JettySolrRunner getRandomSacraficialShard(String slice, boolean dontkillleader) throws KeeperException, InterruptedException {
+  public JettySolrRunner getRandomSacraficialJetty(String slice, boolean dontkillleader) throws KeeperException, InterruptedException {
     // get latest cloud state
     zkStateReader.updateCloudState(true);
     Slice theShards = zkStateReader.getCloudState().getSlices(collection)
@@ -307,9 +336,13 @@ public class ChaosMonkey {
             
             int rnd = random.nextInt(10);
             // nocommit: we dont randomly expire yet
-            if (false && rnd < 2) {
+            if (EXPIRE_SESSIONS && rnd < 2) {
               expireRandomSession();
               expires.incrementAndGet();
+            } else if (CAUSE_CONNECTION_LOSS && rnd < 4) {
+              
+              randomConnectionLoss();
+              connloss.incrementAndGet();
             } else {
               JettySolrRunner jetty;
               if (random.nextBoolean()) {
@@ -332,7 +365,8 @@ public class ChaosMonkey {
         }
         
         System.out.println("I stopped " + stops + " and I started " + starts
-            + ". I also expired " + expires.get());
+            + ". I also expired " + expires.get() + " and caused " + connloss
+            + " connection losses");
       }
     }.start();
   }

Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySolrCloudTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySolrCloudTest.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySolrCloudTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySolrCloudTest.java Wed Jan  4 18:28:20 2012
@@ -22,13 +22,21 @@ import java.util.List;
 
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.common.SolrInputDocument;
+import org.junit.AfterClass;
 import org.junit.BeforeClass;
 
 public class ChaosMonkeySolrCloudTest extends FullSolrCloudTest {
   
   @BeforeClass
   public static void beforeSuperClass() throws Exception {
-    
+    // we expect this type of exception as shards go up and down...
+    ignoreException("shard update error ");
+    ignoreException("Connection refused");
+  }
+  
+  @AfterClass
+  public static void afterSuperClass() throws Exception {
+    resetExceptionIgnores();
   }
   
   public ChaosMonkeySolrCloudTest() {
@@ -53,7 +61,7 @@ public class ChaosMonkeySolrCloudTest ex
     
     chaosMonkey.startTheMonkey();
     
-    Thread.sleep(16000);
+    Thread.sleep(48000);
     
     chaosMonkey.stopTheMonkey();
     
@@ -82,8 +90,7 @@ public class ChaosMonkeySolrCloudTest ex
     //assertEquals(chaosMonkey.getStarts(), getNumberOfRecoveryAttempts() - shardCount - sliceCount);
     
     
-    // does not always pass yet
-    checkShardConsistency(true);
+    checkShardConsistency(false);
     
     if (VERBOSE) System.out.println("control docs:" + controlClient.query(new SolrQuery("*:*")).getResults().getNumFound() + "\n\n");
   }

Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullSolrCloudTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullSolrCloudTest.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullSolrCloudTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullSolrCloudTest.java Wed Jan  4 18:28:20 2012
@@ -178,7 +178,7 @@ public class FullSolrCloudTest extends A
         zkStateReader.createClusterStateWatchersAndUpdate();
       }
       
-      chaosMonkey = new ChaosMonkey(zkServer, zkStateReader, DEFAULT_COLLECTION, shardToJetty, random);
+      chaosMonkey = new ChaosMonkey(zkServer, zkStateReader, DEFAULT_COLLECTION, shardToJetty, shardToClient, random);
     }
     
     // wait until shards have started registering...

Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java Wed Jan  4 18:28:20 2012
@@ -41,12 +41,14 @@ import org.junit.Test;
 
 public class LeaderElectionTest extends SolrTestCaseJ4 {
   
-  static final int TIMEOUT = 10000;
+  static final int TIMEOUT = 30000;
   private ZkTestServer server;
   private SolrZkClient zkClient;
   
   private Map<Integer,Thread> seqToThread;
   
+  private volatile boolean stopStress = false;
+  
   @BeforeClass
   public static void beforeClass() throws Exception {
     createTempDir();
@@ -65,6 +67,7 @@ public class LeaderElectionTest extends 
         + "zookeeper/server1/data";
     
     server = new ZkTestServer(zkDir);
+    server.setTheTickTime(1000);
     server.run();
     AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost());
     AbstractZkTestCase.makeSolrZkNode(server.getZkHost());
@@ -75,7 +78,7 @@ public class LeaderElectionTest extends 
   class ClientThread extends Thread {
     SolrZkClient zkClient;
     private int nodeNumber;
-    private int seq = -1;
+    private volatile int seq = -1;
     private volatile boolean stop;
     private volatile boolean electionDone = false;
     private final ZkNodeProps props;
@@ -100,6 +103,8 @@ public class LeaderElectionTest extends 
           seq = elector.joinElection(context);
           electionDone = true;
           seqToThread.put(seq, this);
+        } catch (InterruptedException e) {
+          return;
         } catch (Throwable e) {
           e.printStackTrace();
         }
@@ -108,7 +113,7 @@ public class LeaderElectionTest extends 
         try {
           Thread.sleep(100);
         } catch (InterruptedException e) {
-          // nothing
+          return;
         }
       }
       
@@ -120,6 +125,14 @@ public class LeaderElectionTest extends 
       }
       this.stop = true;
     }
+
+    public int getSeq() {
+      return seq;
+    }
+
+    public int getNodeNumber() {
+      return nodeNumber;
+    }
   }
 
   @Test
@@ -205,6 +218,7 @@ public class LeaderElectionTest extends 
     // cleanup any threads still running
     for (ClientThread thread : threads) {
       thread.close();
+      thread.interrupt();
     }
     
     for (Thread thread : threads) {
@@ -229,16 +243,19 @@ public class LeaderElectionTest extends 
     Thread scheduleThread = new Thread() {
       @Override
       public void run() {
-        for (int i = 0; i < 20; i++) {
-          int launchIn = random.nextInt(2000);
-          ClientThread thread;
+        
+        for (int i = 0; i < 300; i++) {
+          int launchIn = random.nextInt(6000);
+          ClientThread thread = null;
           try {
             thread = new ClientThread(i);
           } catch (Exception e) {
-            throw new RuntimeException(e);
+            //
+          }
+          if (thread != null) {
+            threads.add(thread);
+            scheduler.schedule(thread, launchIn, TimeUnit.MILLISECONDS);
           }
-          threads.add(thread);
-          scheduler.schedule(thread, launchIn, TimeUnit.MILLISECONDS);
         }
       }
     };
@@ -249,7 +266,7 @@ public class LeaderElectionTest extends 
       @Override
       public void run() {
         
-        for (int i = 0; i < 1000; i++) {
+        while (!stopStress) {
           try {
             int j;
             try {
@@ -259,10 +276,12 @@ public class LeaderElectionTest extends 
             }
             try {
               threads.get(j).close();
+            } catch (InterruptedException e) {
+              throw e;
             } catch (Exception e) {
               
             }
-            threads.remove(j);
+
             Thread.sleep(10);
             
           } catch (Exception e) {
@@ -272,15 +291,63 @@ public class LeaderElectionTest extends 
       }
     };
     
+    Thread connLossThread = new Thread() {
+      @Override
+      public void run() {
+        
+        while (!stopStress) {
+          try {
+            int j;
+            try {
+              j = random.nextInt(threads.size());
+            } catch(IllegalArgumentException e) {
+              continue;
+            }
+            try {
+              threads.get(j).zkClient.getSolrZooKeeper().pauseCnxn(ZkTestServer.TICK_TIME * 2);
+            } catch (Exception e) {
+              e.printStackTrace();
+            }
+            Thread.sleep(10);
+            
+          } catch (Exception e) {
+
+          }
+        }
+      }
+    };
+    
+    connLossThread.start();
     killThread.start();
     
+    Thread.sleep(10000);
+    
+    scheduleThread.interrupt();
+    connLossThread.interrupt();
+    killThread.interrupt();
+    
+    stopStress = true;
+    
     scheduleThread.join();
+    connLossThread.join();
     killThread.join();
     
     Thread.sleep(1000);
     
     scheduler.shutdownNow();
     
+
+    //printLayout(server.getZkAddress());
+    
+    
+    System.out.println("leader thread:" + getLeaderThread());
+    int seq = threads.get(getLeaderThread()).getSeq();
+    System.out.println("Seq:" + seq);
+    System.out.println("Node:" + threads.get(getLeaderThread()).getNodeNumber());
+    
+    assertFalse("seq is -1 and we may have a zombie leader", seq == -1);
+   
+    
     // cleanup any threads still running
     for (ClientThread thread : threads) {
       thread.close();
@@ -290,12 +357,12 @@ public class LeaderElectionTest extends 
       thread.join();
     }
 
-    //printLayout(server.getZkAddress());
     
   }
   
   @Override
   public void tearDown() throws Exception {
+    printLayout(server.getZkAddress());
     zkClient.close();
     server.shutdown();
     SolrConfig.severeErrors.clear();

Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/RecoveryZkTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/RecoveryZkTest.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/RecoveryZkTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/RecoveryZkTest.java Wed Jan  4 18:28:20 2012
@@ -22,9 +22,6 @@ import java.io.IOException;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.embedded.JettySolrRunner;
-import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
-import org.apache.solr.client.solrj.request.UpdateRequest;
-import org.apache.solr.cloud.FullSolrCloudTest.StopableIndexingThread;
 import org.apache.solr.common.SolrInputDocument;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;

Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkTestServer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkTestServer.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkTestServer.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkTestServer.java Wed Jan  4 18:28:20 2012
@@ -44,6 +44,8 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 public class ZkTestServer {
+  public static final int TICK_TIME = 3000;
+
   private static Logger log = LoggerFactory.getLogger(ZkTestServer.class);
   
   protected final ZKServerMain zkServer = new ZKServerMain();
@@ -53,6 +55,8 @@ public class ZkTestServer {
   private int clientPort;
 
   private Thread zooThread;
+  
+  private int theTickTime = TICK_TIME;
 
   class ZKServerMain {
 
@@ -190,7 +194,7 @@ public class ZkTestServer {
             setClientPort(ZkTestServer.this.clientPort);
             this.dataDir = zkDir;
             this.dataLogDir = zkDir;
-            this.tickTime = 3000;
+            this.tickTime = theTickTime;
           }
           
           public void setClientPort(int clientPort) {
@@ -336,4 +340,12 @@ public class ZkTestServer {
     }
     return alist;
   }
+
+  public int getTheTickTime() {
+    return theTickTime;
+  }
+
+  public void setTheTickTime(int theTickTime) {
+    this.theTickTime = theTickTime;
+  }
 }

Modified: lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java (original)
+++ lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java Wed Jan  4 18:28:20 2012
@@ -42,7 +42,6 @@ import org.apache.zookeeper.SolrZooKeepe
 import org.apache.zookeeper.Watcher;
 import org.apache.zookeeper.ZooDefs;
 import org.apache.zookeeper.ZooKeeper;
-import org.apache.zookeeper.KeeperException.NoNodeException;
 import org.apache.zookeeper.KeeperException.NodeExistsException;
 import org.apache.zookeeper.data.ACL;
 import org.apache.zookeeper.data.Stat;

Modified: lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/zookeeper/SolrZooKeeper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/zookeeper/SolrZooKeeper.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/zookeeper/SolrZooKeeper.java (original)
+++ lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/zookeeper/SolrZooKeeper.java Wed Jan  4 18:28:20 2012
@@ -28,8 +28,8 @@ public class SolrZooKeeper extends ZooKe
             try {
               ((SocketChannel) cnxn.sendThread.sockKey.channel()).socket()
                   .close();
-            } catch (IOException e) {
-              e.printStackTrace();
+            } catch (Exception e) {
+
             }
             Thread.sleep(ms);
           } catch (InterruptedException e) {}