You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2012/01/04 19:28:20 UTC
svn commit: r1227255 - in /lucene/dev/branches/solrcloud/solr:
core/src/java/org/apache/solr/client/solrj/embedded/
core/src/java/org/apache/solr/cloud/ core/src/java/org/apache/solr/update/
core/src/test/org/apache/solr/cloud/ solrj/src/java/org/apach...
Author: markrmiller
Date: Wed Jan 4 18:28:20 2012
New Revision: 1227255
URL: http://svn.apache.org/viewvc?rev=1227255&view=rev
Log:
harden leader election against connection loss - also add infra for doing connection loss and expiration testing with ChaosMonkey
Added:
lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkCmdExecutor.java (with props)
lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZooKeeperOperation.java (with props)
Modified:
lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java
lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java
lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java
lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySolrCloudTest.java
lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullSolrCloudTest.java
lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java
lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/RecoveryZkTest.java
lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkTestServer.java
lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java
lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/zookeeper/SolrZooKeeper.java
Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java Wed Jan 4 18:28:20 2012
@@ -34,6 +34,7 @@ import org.mortbay.jetty.servlet.Context
import org.mortbay.jetty.servlet.FilterHolder;
import org.mortbay.jetty.servlet.HashSessionIdManager;
import org.mortbay.log.Logger;
+import org.mortbay.thread.QueuedThreadPool;
/**
* Run solr using jetty
@@ -94,8 +95,22 @@ public class JettySolrRunner {
SocketConnector connector = new SocketConnector();
connector.setPort(port);
connector.setReuseAddress(true);
+ QueuedThreadPool threadPool = (QueuedThreadPool) connector.getThreadPool();
+ if (threadPool != null) {
+ threadPool.setMaxStopTimeMs(100);
+ }
server.setConnectors(new Connector[] { connector });
server.setSessionIdManager(new HashSessionIdManager(new Random()));
+ } else {
+ for (Connector connector : server.getConnectors()) {
+ if (connector instanceof SocketConnector) {
+ QueuedThreadPool threadPool = (QueuedThreadPool) ((SocketConnector) connector)
+ .getThreadPool();
+ if (threadPool != null) {
+ threadPool.setMaxStopTimeMs(100);
+ }
+ }
+ }
}
// Initialize the servlets
Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java Wed Jan 4 18:28:20 2012
@@ -44,19 +44,28 @@ public abstract class ElectionContext {
final class ShardLeaderElectionContext extends ElectionContext {
private final SolrZkClient zkClient;
+ private ZkCmdExecutor proto;
public ShardLeaderElectionContext(final String shardId,
final String collection, final String shardZkNodeName, ZkNodeProps props, SolrZkClient zkClient) {
super(shardZkNodeName, ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/leader_elect/"
+ shardId, ZkStateReader.getShardLeadersPath(collection, shardId),
props);
this.zkClient = zkClient;
+ this.proto = new ZkCmdExecutor(zkClient);
}
@Override
void runLeaderProcess() throws KeeperException, InterruptedException {
- String currentLeaderZkPath = leaderPath;
- zkClient.makePath(currentLeaderZkPath, leaderProps == null ? null
- : ZkStateReader.toJSON(leaderProps), CreateMode.EPHEMERAL);
+ proto.retryOperation(new ZooKeeperOperation() {
+
+ @Override
+ public Object execute() throws KeeperException, InterruptedException {
+ zkClient.makePath(leaderPath, leaderProps == null ? null
+ : ZkStateReader.toJSON(leaderProps), CreateMode.EPHEMERAL);
+ return null;
+ }
+ });
+
}
}
Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java Wed Jan 4 18:28:20 2012
@@ -31,6 +31,7 @@ import org.apache.solr.common.cloud.Solr
import org.apache.solr.common.cloud.ZooKeeperException;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.KeeperException.ConnectionLossException;
import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
@@ -57,12 +58,16 @@ public class LeaderElector {
private static final String ELECTION_NODE = "/election";
- private final static Pattern LEADER_SEQ = Pattern.compile(".*?/?n_(\\d+)");
+ private final static Pattern LEADER_SEQ = Pattern.compile(".*?/?.*?-n_(\\d+)");
+ private final static Pattern SESSION_ID = Pattern.compile(".*?/?(.*?)-n_\\d+");
+
+ private ZkCmdExecutor cmdExecutor;
protected SolrZkClient zkClient;
public LeaderElector(SolrZkClient zkClient) {
this.zkClient = zkClient;
+ cmdExecutor = new ZkCmdExecutor(zkClient);
}
/**
@@ -81,8 +86,16 @@ public class LeaderElector {
private void checkIfIamLeader(final int seq, final ElectionContext context) throws KeeperException,
InterruptedException, IOException {
// get all other numbers...
- String holdElectionPath = context.electionPath + ELECTION_NODE;
- List<String> seqs = zkClient.getChildren(holdElectionPath, null);
+ final String holdElectionPath = context.electionPath + ELECTION_NODE;
+ List<String> seqs = cmdExecutor.retryOperation(new ZooKeeperOperation() {
+
+ @Override
+ public Object execute() throws KeeperException, InterruptedException {
+ return zkClient.getChildren(holdElectionPath, null);
+ }
+ });
+
+
sortSeqs(seqs);
List<Integer> intSeqs = getSeqs(seqs);
if (seq <= intSeqs.get(0)) {
@@ -151,6 +164,18 @@ public class LeaderElector {
return seq;
}
+ /**
+  * Parses the numeric id that precedes the "-n_&lt;digits&gt;" suffix of an
+  * election node name, as matched by the SESSION_ID pattern.
+  *
+  * @param nStringSequence election node name to parse
+  * @return the id captured by the pattern, as a long
+  * @throws IllegalStateException if the name does not match the pattern
+  */
+ private long getSessionId(String nStringSequence) {
+   final Matcher matcher = SESSION_ID.matcher(nStringSequence);
+   if (!matcher.matches()) {
+     throw new IllegalStateException("Could not find regex match in:"
+         + nStringSequence);
+   }
+   return Long.parseLong(matcher.group(1));
+ }
+
+
/**
* Returns int list given list of form n_0000000001, n_0000000003, etc.
*
@@ -182,15 +207,38 @@ public class LeaderElector {
public int joinElection(ElectionContext context) throws KeeperException, InterruptedException, IOException {
final String shardsElectZkPath = context.electionPath + LeaderElector.ELECTION_NODE;
+ long id = zkClient.getSolrZooKeeper().getSessionId();
String leaderSeqPath = null;
boolean cont = true;
int tries = 0;
while (cont) {
try {
-
- leaderSeqPath = zkClient.create(shardsElectZkPath + "/n_", null,
+ leaderSeqPath = zkClient.create(shardsElectZkPath + "/" + id + "-n_", null,
CreateMode.EPHEMERAL_SEQUENTIAL);
cont = false;
+ } catch (ConnectionLossException e) {
+ // we don't know if we made our node or not...
+ List<String> entries = cmdExecutor.retryOperation(new ZooKeeperOperation() {
+
+ @Override
+ public Object execute() throws KeeperException, InterruptedException {
+ return zkClient.getChildren(shardsElectZkPath, null);
+ }
+ });
+
+ boolean foundId = false;
+ for (String entry : entries) {
+ long nodeId = getSessionId(entry);
+ if (id == nodeId) {
+ // we did create our node...
+ foundId = true;
+ break;
+ }
+ }
+ if (!foundId) {
+ throw e;
+ }
+
} catch (KeeperException.NoNodeException e) {
// we must have failed in creating the election node - someone else must
// be working on it, lets try again
@@ -224,7 +272,7 @@ public class LeaderElector {
try {
// leader election node
- if (!zkClient.exists(electZKPath)) {
+ if (!zkClient.exists(electZKPath)) { // on connection loss we throw out an exception
// make new leader election node
zkClient.makePath(electZKPath, CreateMode.PERSISTENT, null);
Added: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkCmdExecutor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkCmdExecutor.java?rev=1227255&view=auto
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkCmdExecutor.java (added)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkCmdExecutor.java Wed Jan 4 18:28:20 2012
@@ -0,0 +1,153 @@
+package org.apache.solr.cloud;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.List;
+
+import org.apache.log4j.Logger;
+import org.apache.solr.common.cloud.SolrZkClient;
+import org.apache.zookeeper.CreateMode;
+import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.ZooDefs;
+import org.apache.zookeeper.data.ACL;
+
+
+/**
+ * Runs {@link ZooKeeperOperation}s against a {@link SolrZkClient}, retrying
+ * them when the connection to ZooKeeper is lost.
+ */
+public class ZkCmdExecutor {
+  private static final Logger LOG = Logger.getLogger(ZkCmdExecutor.class);
+
+  protected final SolrZkClient solrZkClient;
+  private long retryDelay = 1000L;
+  private int retryCount = 15;
+  private List<ACL> acl = ZooDefs.Ids.OPEN_ACL_UNSAFE;
+
+  /**
+   * @param solrZkClient
+   *          the client the operations are executed against
+   */
+  public ZkCmdExecutor(SolrZkClient solrZkClient) {
+    this.solrZkClient = solrZkClient;
+  }
+
+  /**
+   * Returns the ACL used by {@link #ensurePathExists(String)}.
+   *
+   * @return the acl.
+   */
+  public List<ACL> getAcl() {
+    return acl;
+  }
+
+  /**
+   * Sets the ACL used by {@link #ensurePathExists(String)}.
+   *
+   * @param acl
+   *          the acl to set to
+   */
+  public void setAcl(List<ACL> acl) {
+    this.acl = acl;
+  }
+
+  /**
+   * Returns the base retry delay in milliseconds.
+   *
+   * @return the retry delay
+   */
+  public long getRetryDelay() {
+    return retryDelay;
+  }
+
+  /**
+   * Sets the base delay, in milliseconds, waited between retries. The actual
+   * wait before a retry is this value multiplied by the number of attempts
+   * already made.
+   *
+   * @param retryDelay
+   *          the retry delay
+   */
+  public void setRetryDelay(long retryDelay) {
+    this.retryDelay = retryDelay;
+  }
+
+  /**
+   * Performs the given operation, retrying it when it fails with a
+   * {@link KeeperException.ConnectionLossException}. Any other
+   * {@link KeeperException} is propagated immediately.
+   *
+   * @param operation
+   *          the operation to run; it may be executed more than once
+   * @return the operation's result. It is cast, unchecked, to the caller's
+   *         expected return type.
+   * @throws KeeperException
+   *           the first connection loss seen once all attempts have failed,
+   *           or any other ZooKeeper error raised by the operation
+   * @throws InterruptedException
+   *           if the operation is interrupted, or if the thread is
+   *           interrupted while waiting to retry
+   */
+  @SuppressWarnings("unchecked")
+  protected <T> T retryOperation(ZooKeeperOperation operation)
+      throws KeeperException, InterruptedException {
+    KeeperException exception = null;
+    for (int i = 0; i < retryCount; i++) {
+      try {
+        return (T) operation.execute();
+      } catch (KeeperException.ConnectionLossException e) {
+        // report the first failure, not the last one
+        if (exception == null) {
+          exception = e;
+        }
+        // no point in sleeping after the final attempt - we are about to give up
+        if (i + 1 < retryCount) {
+          retryDelay(i);
+          // retryDelay restores the interrupt flag rather than throwing;
+          // stop retrying instead of spinning through the remaining attempts
+          if (Thread.interrupted()) {
+            throw new InterruptedException(
+                "Interrupted while waiting to retry ZooKeeper operation");
+          }
+        }
+      }
+    }
+    throw exception;
+  }
+
+  /**
+   * Ensures that the given path exists with no data, the current ACL and no
+   * flags
+   *
+   * @param path
+   *          the path that should exist
+   */
+  protected void ensurePathExists(String path) {
+    ensureExists(path, null, acl, CreateMode.PERSISTENT);
+  }
+
+  /**
+   * Ensures that the given path exists, creating it with the given data, ACL
+   * and flags if it does not. Failures are logged and not propagated; an
+   * interrupt is recorded on the current thread.
+   *
+   * @param path
+   *          the path that should exist
+   * @param data
+   *          the data to create the node with, may be null
+   * @param acl
+   *          the ACL to create the node with
+   * @param flags
+   *          the create mode to create the node with
+   */
+  protected void ensureExists(final String path, final byte[] data,
+      final List<ACL> acl, final CreateMode flags) {
+    try {
+      retryOperation(new ZooKeeperOperation() {
+        @Override
+        public Object execute() throws KeeperException, InterruptedException {
+          if (solrZkClient.exists(path)) {
+            return true;
+          }
+          solrZkClient.create(path, data, acl, flags);
+          return true;
+        }
+      });
+    } catch (KeeperException e) {
+      LOG.warn("Could not ensure that path exists: " + path, e);
+    } catch (InterruptedException e) {
+      // this method cannot rethrow, so keep the interrupt visible to callers
+      Thread.currentThread().interrupt();
+      LOG.warn("Interrupted while ensuring that path exists: " + path, e);
+    }
+  }
+
+  /**
+   * Performs a retry delay if this is not the first attempt. The delay is
+   * {@code attemptCount * retryDelay} milliseconds. If the sleep is
+   * interrupted, the thread's interrupt flag is set again before returning.
+   *
+   * @param attemptCount
+   *          the number of the attempts performed so far
+   */
+  protected void retryDelay(int attemptCount) {
+    if (attemptCount > 0) {
+      try {
+        Thread.sleep(attemptCount * retryDelay);
+      } catch (InterruptedException e) {
+        LOG.debug("Failed to sleep: " + e, e);
+        // do not lose the interrupt - the caller decides how to react
+        Thread.currentThread().interrupt();
+      }
+    }
+  }
+}
Added: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZooKeeperOperation.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZooKeeperOperation.java?rev=1227255&view=auto
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZooKeeperOperation.java (added)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZooKeeperOperation.java Wed Jan 4 18:28:20 2012
@@ -0,0 +1,38 @@
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.cloud;
+
+import org.apache.zookeeper.KeeperException;
+
+/**
+ * A callback describing a single, retry-able unit of ZooKeeper work. Instances
+ * are handed to {@link ZkCmdExecutor}, which runs them and runs them again
+ * when the connection is lost.
+ */
+public interface ZooKeeperOperation {
+
+  /**
+   * Carries out the operation. It may be invoked multiple times if the
+   * connection to ZooKeeper is lost while it is running.
+   *
+   * @return the result of the operation or null
+   * @throws KeeperException
+   *           if the ZooKeeper call made by the operation fails
+   * @throws InterruptedException
+   *           if the operation is interrupted
+   */
+  Object execute() throws KeeperException, InterruptedException;
+}
Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java Wed Jan 4 18:28:20 2012
@@ -363,7 +363,7 @@ public class SolrCmdDistributor {
response.errors.add(error);
response.sreq = sreq;
SolrException.logOnce(SolrCore.log, "shard update error "
- + sreq.node + " (" + sreq.node + ")", sreq.exception);
+ + sreq.node, sreq.exception);
}
}
@@ -419,7 +419,7 @@ public class SolrCmdDistributor {
@Override
public String toString() {
- return url;
+ return this.getClass().getSimpleName() + ": " + url;
}
@Override
Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java Wed Jan 4 18:28:20 2012
@@ -24,11 +24,14 @@ import java.util.Map;
import java.util.Random;
import java.util.concurrent.atomic.AtomicInteger;
+import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.cloud.FullSolrCloudTest.CloudJettyRunner;
import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
+import org.apache.solr.core.CoreContainer;
import org.apache.solr.servlet.SolrDispatchFilter;
import org.apache.zookeeper.KeeperException;
import org.mortbay.jetty.servlet.FilterHolder;
@@ -43,6 +46,8 @@ import org.mortbay.jetty.servlet.FilterH
public class ChaosMonkey {
private static final boolean DONTKILLLEADER = true;
+ protected static final boolean EXPIRE_SESSIONS = false;
+ protected static final boolean CAUSE_CONNECTION_LOSS = false;
private Map<String,List<CloudJettyRunner>> shardToJetty;
private ZkTestServer zkServer;
private ZkStateReader zkStateReader;
@@ -52,11 +57,15 @@ public class ChaosMonkey {
private AtomicInteger stops = new AtomicInteger();
private AtomicInteger starts = new AtomicInteger();
private AtomicInteger expires = new AtomicInteger();
+ private AtomicInteger connloss = new AtomicInteger();
+
+ private Map<String,List<SolrServer>> shardToClient;
public ChaosMonkey(ZkTestServer zkServer, ZkStateReader zkStateReader,
String collection, Map<String,List<CloudJettyRunner>> shardToJetty,
- Random random) {
+ Map<String,List<SolrServer>> shardToClient, Random random) {
this.shardToJetty = shardToJetty;
+ this.shardToClient = shardToClient;
this.zkServer = zkServer;
this.zkStateReader = zkStateReader;
this.collection = collection;
@@ -70,17 +79,37 @@ public class ChaosMonkey {
}
public void expireRandomSession() throws KeeperException, InterruptedException {
- Map<String,Slice> slices = zkStateReader.getCloudState().getSlices(collection);
- List<String> sliceKeyList = new ArrayList<String>(slices.size());
- sliceKeyList.addAll(slices.keySet());
- String sliceName = sliceKeyList.get(random.nextInt(sliceKeyList.size()));
+ String sliceName = getRandomSlice();
- JettySolrRunner jetty = getRandomSacraficialShard(sliceName, DONTKILLLEADER);
+ JettySolrRunner jetty = getRandomSacraficialJetty(sliceName, DONTKILLLEADER);
if (jetty != null) {
expireSession(jetty);
}
}
+ /**
+  * Picks a random slice, asks for a sacrificial jetty from it and, if one is
+  * returned, causes a ZooKeeper connection loss on it.
+  */
+ public void randomConnectionLoss() throws KeeperException, InterruptedException {
+   JettySolrRunner victim = getRandomSacraficialJetty(getRandomSlice(), DONTKILLLEADER);
+   if (victim == null) {
+     // nothing suitable to disturb
+     return;
+   }
+
+   System.out.println("cause connection loss");
+   causeConnectionLoss(victim);
+ }
+
+ /**
+  * Pauses the ZooKeeper connection of the given jetty's core container for
+  * three times {@link ZkTestServer#TICK_TIME}. Does nothing when the jetty
+  * has no dispatch filter instance or no core container.
+  *
+  * @param jetty the jetty whose ZooKeeper client connection is paused
+  */
+ private void causeConnectionLoss(JettySolrRunner jetty) {
+   SolrDispatchFilter filter = (SolrDispatchFilter) jetty
+       .getDispatchFilter().getFilter();
+   if (filter == null) {
+     return;
+   }
+   CoreContainer container = filter.getCores();
+   if (container == null) {
+     return;
+   }
+   SolrZkClient client = container.getZkController().getZkClient();
+   // over double tick time...
+   client.getSolrZooKeeper().pauseCnxn(ZkTestServer.TICK_TIME * 3);
+ }
+
public JettySolrRunner stopShard(String slice, int index) throws Exception {
JettySolrRunner jetty = shardToJetty.get(slice).get(index).jetty;
stopJetty(jetty);
@@ -163,18 +192,13 @@ public class ChaosMonkey {
}
public JettySolrRunner stopRandomShard() throws Exception {
- // add all the shards to a list
- Map<String,Slice> slices = zkStateReader.getCloudState().getSlices(collection);
-
- List<String> sliceKeyList = new ArrayList<String>(slices.size());
- sliceKeyList.addAll(slices.keySet());
- String sliceName = sliceKeyList.get(random.nextInt(sliceKeyList.size()));
+ String sliceName = getRandomSlice();
return stopRandomShard(sliceName);
}
public JettySolrRunner stopRandomShard(String slice) throws Exception {
- JettySolrRunner jetty = getRandomSacraficialShard(slice, DONTKILLLEADER);
+ JettySolrRunner jetty = getRandomSacraficialJetty(slice, DONTKILLLEADER);
if (jetty != null) {
stopJetty(jetty);
}
@@ -184,24 +208,29 @@ public class ChaosMonkey {
public JettySolrRunner killRandomShard() throws Exception {
// add all the shards to a list
+ String sliceName = getRandomSlice();
+
+ return killRandomShard(sliceName);
+ }
+
+ private String getRandomSlice() {
Map<String,Slice> slices = zkStateReader.getCloudState().getSlices(collection);
List<String> sliceKeyList = new ArrayList<String>(slices.size());
sliceKeyList.addAll(slices.keySet());
String sliceName = sliceKeyList.get(random.nextInt(sliceKeyList.size()));
-
- return killRandomShard(sliceName);
+ return sliceName;
}
public JettySolrRunner killRandomShard(String slice) throws Exception {
- JettySolrRunner jetty = getRandomSacraficialShard(slice, DONTKILLLEADER);
+ JettySolrRunner jetty = getRandomSacraficialJetty(slice, DONTKILLLEADER);
if (jetty != null) {
killJetty(jetty);
}
return jetty;
}
- public JettySolrRunner getRandomSacraficialShard(String slice, boolean dontkillleader) throws KeeperException, InterruptedException {
+ public JettySolrRunner getRandomSacraficialJetty(String slice, boolean dontkillleader) throws KeeperException, InterruptedException {
// get latest cloud state
zkStateReader.updateCloudState(true);
Slice theShards = zkStateReader.getCloudState().getSlices(collection)
@@ -307,9 +336,13 @@ public class ChaosMonkey {
int rnd = random.nextInt(10);
// nocommit: we dont randomly expire yet
- if (false && rnd < 2) {
+ if (EXPIRE_SESSIONS && rnd < 2) {
expireRandomSession();
expires.incrementAndGet();
+ } else if (CAUSE_CONNECTION_LOSS && rnd < 4) {
+
+ randomConnectionLoss();
+ connloss.incrementAndGet();
} else {
JettySolrRunner jetty;
if (random.nextBoolean()) {
@@ -332,7 +365,8 @@ public class ChaosMonkey {
}
System.out.println("I stopped " + stops + " and I started " + starts
- + ". I also expired " + expires.get());
+ + ". I also expired " + expires.get() + " and caused " + connloss
+ + " connection losses");
}
}.start();
}
Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySolrCloudTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySolrCloudTest.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySolrCloudTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySolrCloudTest.java Wed Jan 4 18:28:20 2012
@@ -22,13 +22,21 @@ import java.util.List;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.common.SolrInputDocument;
+import org.junit.AfterClass;
import org.junit.BeforeClass;
public class ChaosMonkeySolrCloudTest extends FullSolrCloudTest {
@BeforeClass
public static void beforeSuperClass() throws Exception {
-
+ // we expect this type of exception as shards go up and down...
+ ignoreException("shard update error ");
+ ignoreException("Connection refused");
+ }
+
+ @AfterClass
+ public static void afterSuperClass() throws Exception {
+ resetExceptionIgnores();
}
public ChaosMonkeySolrCloudTest() {
@@ -53,7 +61,7 @@ public class ChaosMonkeySolrCloudTest ex
chaosMonkey.startTheMonkey();
- Thread.sleep(16000);
+ Thread.sleep(48000);
chaosMonkey.stopTheMonkey();
@@ -82,8 +90,7 @@ public class ChaosMonkeySolrCloudTest ex
//assertEquals(chaosMonkey.getStarts(), getNumberOfRecoveryAttempts() - shardCount - sliceCount);
- // does not always pass yet
- checkShardConsistency(true);
+ checkShardConsistency(false);
if (VERBOSE) System.out.println("control docs:" + controlClient.query(new SolrQuery("*:*")).getResults().getNumFound() + "\n\n");
}
Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullSolrCloudTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullSolrCloudTest.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullSolrCloudTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullSolrCloudTest.java Wed Jan 4 18:28:20 2012
@@ -178,7 +178,7 @@ public class FullSolrCloudTest extends A
zkStateReader.createClusterStateWatchersAndUpdate();
}
- chaosMonkey = new ChaosMonkey(zkServer, zkStateReader, DEFAULT_COLLECTION, shardToJetty, random);
+ chaosMonkey = new ChaosMonkey(zkServer, zkStateReader, DEFAULT_COLLECTION, shardToJetty, shardToClient, random);
}
// wait until shards have started registering...
Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java Wed Jan 4 18:28:20 2012
@@ -41,12 +41,14 @@ import org.junit.Test;
public class LeaderElectionTest extends SolrTestCaseJ4 {
- static final int TIMEOUT = 10000;
+ static final int TIMEOUT = 30000;
private ZkTestServer server;
private SolrZkClient zkClient;
private Map<Integer,Thread> seqToThread;
+ private volatile boolean stopStress = false;
+
@BeforeClass
public static void beforeClass() throws Exception {
createTempDir();
@@ -65,6 +67,7 @@ public class LeaderElectionTest extends
+ "zookeeper/server1/data";
server = new ZkTestServer(zkDir);
+ server.setTheTickTime(1000);
server.run();
AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost());
AbstractZkTestCase.makeSolrZkNode(server.getZkHost());
@@ -75,7 +78,7 @@ public class LeaderElectionTest extends
class ClientThread extends Thread {
SolrZkClient zkClient;
private int nodeNumber;
- private int seq = -1;
+ private volatile int seq = -1;
private volatile boolean stop;
private volatile boolean electionDone = false;
private final ZkNodeProps props;
@@ -100,6 +103,8 @@ public class LeaderElectionTest extends
seq = elector.joinElection(context);
electionDone = true;
seqToThread.put(seq, this);
+ } catch (InterruptedException e) {
+ return;
} catch (Throwable e) {
e.printStackTrace();
}
@@ -108,7 +113,7 @@ public class LeaderElectionTest extends
try {
Thread.sleep(100);
} catch (InterruptedException e) {
- // nothing
+ return;
}
}
@@ -120,6 +125,14 @@ public class LeaderElectionTest extends
}
this.stop = true;
}
+
+ public int getSeq() {
+ return seq;
+ }
+
+ public int getNodeNumber() {
+ return nodeNumber;
+ }
}
@Test
@@ -205,6 +218,7 @@ public class LeaderElectionTest extends
// cleanup any threads still running
for (ClientThread thread : threads) {
thread.close();
+ thread.interrupt();
}
for (Thread thread : threads) {
@@ -229,16 +243,19 @@ public class LeaderElectionTest extends
Thread scheduleThread = new Thread() {
@Override
public void run() {
- for (int i = 0; i < 20; i++) {
- int launchIn = random.nextInt(2000);
- ClientThread thread;
+
+ for (int i = 0; i < 300; i++) {
+ int launchIn = random.nextInt(6000);
+ ClientThread thread = null;
try {
thread = new ClientThread(i);
} catch (Exception e) {
- throw new RuntimeException(e);
+ //
+ }
+ if (thread != null) {
+ threads.add(thread);
+ scheduler.schedule(thread, launchIn, TimeUnit.MILLISECONDS);
}
- threads.add(thread);
- scheduler.schedule(thread, launchIn, TimeUnit.MILLISECONDS);
}
}
};
@@ -249,7 +266,7 @@ public class LeaderElectionTest extends
@Override
public void run() {
- for (int i = 0; i < 1000; i++) {
+ while (!stopStress) {
try {
int j;
try {
@@ -259,10 +276,12 @@ public class LeaderElectionTest extends
}
try {
threads.get(j).close();
+ } catch (InterruptedException e) {
+ throw e;
} catch (Exception e) {
}
- threads.remove(j);
+
Thread.sleep(10);
} catch (Exception e) {
@@ -272,15 +291,63 @@ public class LeaderElectionTest extends
}
};
+ Thread connLossThread = new Thread() {
+ @Override
+ public void run() {
+
+ while (!stopStress) {
+ try {
+ int j;
+ try {
+ j = random.nextInt(threads.size());
+ } catch(IllegalArgumentException e) {
+ continue;
+ }
+ try {
+ threads.get(j).zkClient.getSolrZooKeeper().pauseCnxn(ZkTestServer.TICK_TIME * 2);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ Thread.sleep(10);
+
+ } catch (Exception e) {
+
+ }
+ }
+ }
+ };
+
+ connLossThread.start();
killThread.start();
+ Thread.sleep(10000);
+
+ scheduleThread.interrupt();
+ connLossThread.interrupt();
+ killThread.interrupt();
+
+ stopStress = true;
+
scheduleThread.join();
+ connLossThread.join();
killThread.join();
Thread.sleep(1000);
scheduler.shutdownNow();
+
+ //printLayout(server.getZkAddress());
+
+
+ System.out.println("leader thread:" + getLeaderThread());
+ int seq = threads.get(getLeaderThread()).getSeq();
+ System.out.println("Seq:" + seq);
+ System.out.println("Node:" + threads.get(getLeaderThread()).getNodeNumber());
+
+ assertFalse("seq is -1 and we may have a zombie leader", seq == -1);
+
+
// cleanup any threads still running
for (ClientThread thread : threads) {
thread.close();
@@ -290,12 +357,12 @@ public class LeaderElectionTest extends
thread.join();
}
- //printLayout(server.getZkAddress());
}
@Override
public void tearDown() throws Exception {
+ printLayout(server.getZkAddress());
zkClient.close();
server.shutdown();
SolrConfig.severeErrors.clear();
Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/RecoveryZkTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/RecoveryZkTest.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/RecoveryZkTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/RecoveryZkTest.java Wed Jan 4 18:28:20 2012
@@ -22,9 +22,6 @@ import java.io.IOException;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
-import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
-import org.apache.solr.client.solrj.request.UpdateRequest;
-import org.apache.solr.cloud.FullSolrCloudTest.StopableIndexingThread;
import org.apache.solr.common.SolrInputDocument;
import org.junit.AfterClass;
import org.junit.BeforeClass;
Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkTestServer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkTestServer.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkTestServer.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkTestServer.java Wed Jan 4 18:28:20 2012
@@ -44,6 +44,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class ZkTestServer {
+ public static final int TICK_TIME = 3000;
+
private static Logger log = LoggerFactory.getLogger(ZkTestServer.class);
protected final ZKServerMain zkServer = new ZKServerMain();
@@ -53,6 +55,8 @@ public class ZkTestServer {
private int clientPort;
private Thread zooThread;
+
+ private int theTickTime = TICK_TIME;
class ZKServerMain {
@@ -190,7 +194,7 @@ public class ZkTestServer {
setClientPort(ZkTestServer.this.clientPort);
this.dataDir = zkDir;
this.dataLogDir = zkDir;
- this.tickTime = 3000;
+ this.tickTime = theTickTime;
}
public void setClientPort(int clientPort) {
@@ -336,4 +340,12 @@ public class ZkTestServer {
}
return alist;
}
+
+ public int getTheTickTime() {
+ return theTickTime;
+ }
+
+ public void setTheTickTime(int theTickTime) {
+ this.theTickTime = theTickTime;
+ }
}
Modified: lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java (original)
+++ lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java Wed Jan 4 18:28:20 2012
@@ -42,7 +42,6 @@ import org.apache.zookeeper.SolrZooKeepe
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.ZooDefs;
import org.apache.zookeeper.ZooKeeper;
-import org.apache.zookeeper.KeeperException.NoNodeException;
import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.apache.zookeeper.data.ACL;
import org.apache.zookeeper.data.Stat;
Modified: lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/zookeeper/SolrZooKeeper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/zookeeper/SolrZooKeeper.java?rev=1227255&r1=1227254&r2=1227255&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/zookeeper/SolrZooKeeper.java (original)
+++ lucene/dev/branches/solrcloud/solr/solrj/src/java/org/apache/zookeeper/SolrZooKeeper.java Wed Jan 4 18:28:20 2012
@@ -28,8 +28,8 @@ public class SolrZooKeeper extends ZooKe
try {
((SocketChannel) cnxn.sendThread.sockKey.channel()).socket()
.close();
- } catch (IOException e) {
- e.printStackTrace();
+ } catch (Exception e) {
+
}
Thread.sleep(ms);
} catch (InterruptedException e) {}