You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2011/12/15 18:04:54 UTC
svn commit: r1214865 - in /lucene/dev/branches/solrcloud/solr: ./
core/src/java/org/apache/solr/cloud/ core/src/java/org/apache/solr/core/
core/src/test/org/apache/solr/cloud/
Author: markrmiller
Date: Thu Dec 15 17:04:53 2011
New Revision: 1214865
URL: http://svn.apache.org/viewvc?rev=1214865&view=rev
Log:
more work on the full chaos monkey test and remove some numShards stuff that is no longer used
Modified:
lucene/dev/branches/solrcloud/solr/ (props changed)
lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java
lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkController.java
lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/core/CoreContainer.java
lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonekyDistributedZkTest.java
lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java
lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java
lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java?rev=1214865&r1=1214864&r2=1214865&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java Thu Dec 15 17:04:53 2011
@@ -18,12 +18,15 @@ package org.apache.solr.cloud;
*/
import java.io.IOException;
+import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.core.RequestHandlers.LazyRequestHandlerWrapper;
import org.apache.solr.core.SolrCore;
@@ -54,6 +57,9 @@ public class RecoveryStrat {
public void finishedRecovery();
}
+ // TODO: we want to be pretty noisy if we don't properly recover?
+ // Also, if we cannot talk to the leader, we need to pause and see if there is
+ // a new leader or continue trying
public void recover(final SolrCore core, final String leaderUrl,
final boolean iamLeader, final OnFinish onFinish) {
log.info("Start recovery process");
@@ -91,17 +97,14 @@ public class RecoveryStrat {
onFinish.run();
} catch (SolrServerException e) {
log.error("", e);
- // nocommit
- e.printStackTrace();
} catch (IOException e) {
log.error("", e);
- // nocommit
- e.printStackTrace();
- } catch (Exception e) {
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
log.error("", e);
- // nocommit
- e.printStackTrace();
- }
+ } catch (ExecutionException e) {
+ log.error("", e);
+ }
log.info("Finished recovery process");
// nocommit: if we get an exception, recovery failed...
}
@@ -111,7 +114,7 @@ public class RecoveryStrat {
}
private void doRecovery(SolrCore core, String leaderUrl, boolean iamleader)
- throws Exception, SolrServerException, IOException {
+ throws SolrServerException, IOException {
// start buffer updates to tran log
// and do recovery - either replay via realtime get (eventually)
@@ -136,6 +139,7 @@ public class RecoveryStrat {
ReplicationHandler replicationHandler = (ReplicationHandler) handler;
if (replicationHandler == null) {
+ // nocommit: we should not just return - we don't want to falsely advertise as active
log.error("Skipping recovery, no /replication handler found");
return;
}
Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkController.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkController.java?rev=1214865&r1=1214864&r2=1214865&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkController.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkController.java Thu Dec 15 17:04:53 2011
@@ -92,8 +92,6 @@ public final class ZkController {
private String hostName;
private OverseerElector overseerElector;
-
- private int numShards; //not used anywhere now
private Map<String, CoreAssignment> assignments = new HashMap<String, CoreAssignment>();
@@ -139,14 +137,13 @@ public final class ZkController {
* @throws IOException
*/
public ZkController(String zkServerAddress, int zkClientTimeout, int zkClientConnectTimeout, String localHost, String locaHostPort,
- String localHostContext, int numShards, final CurrentCoreDescriptorProvider registerOnReconnect) throws InterruptedException,
+ String localHostContext, final CurrentCoreDescriptorProvider registerOnReconnect) throws InterruptedException,
TimeoutException, IOException {
this.zkServerAddress = zkServerAddress;
this.localHostPort = locaHostPort;
this.localHostContext = localHostContext;
this.localHost = localHost;
- this.numShards = numShards;
zkClient = new SolrZkClient(zkServerAddress, zkClientTimeout, zkClientConnectTimeout,
// on reconnect, reload cloud info
Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/core/CoreContainer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/core/CoreContainer.java?rev=1214865&r1=1214864&r2=1214865&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/core/CoreContainer.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/core/CoreContainer.java Thu Dec 15 17:04:53 2011
@@ -88,7 +88,6 @@ public class CoreContainer
private SolrZkServer zkServer;
private String zkHost;
- private int numShards;
{
log.info("New CoreContainer " + System.identityHashCode(this));
@@ -165,7 +164,7 @@ public class CoreContainer
} else {
log.info("Zookeeper client=" + zookeeperHost);
}
- zkController = new ZkController(zookeeperHost, zkClientTimeout, zkClientConnectTimeout, host, hostPort, hostContext, numShards, new CurrentCoreDescriptorProvider() {
+ zkController = new ZkController(zookeeperHost, zkClientTimeout, zkClientConnectTimeout, host, hostPort, hostContext, new CurrentCoreDescriptorProvider() {
@Override
public List<CoreDescriptor> getCurrentDescriptors() {
@@ -319,9 +318,6 @@ public class CoreContainer
hostContext = cfg.get("solr/cores/@hostContext", "solr");
host = cfg.get("solr/cores/@host", null);
-
- // TODO: allow override by core so you can change on the fly?
- numShards = cfg.getInt("solr/cores/@numShards", 3);
if(shareSchema){
indexSchemaCache = new ConcurrentHashMap<String ,IndexSchema>();
Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonekyDistributedZkTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonekyDistributedZkTest.java?rev=1214865&r1=1214864&r2=1214865&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonekyDistributedZkTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonekyDistributedZkTest.java Thu Dec 15 17:04:53 2011
@@ -23,10 +23,8 @@ import org.junit.Ignore;
/**
*
*/
-@Ignore("this test is not ready")
public class ChaosMonekyDistributedZkTest extends FullDistributedZkTest {
-
@BeforeClass
public static void beforeSuperClass() throws Exception {
@@ -38,19 +36,18 @@ public class ChaosMonekyDistributedZkTes
@Override
public void doTest() throws Exception {
+
handle.clear();
handle.put("QTime", SKIPVAL);
handle.put("timestamp", SKIPVAL);
del("*:*");
- indexr(id, 1, i1, 100, tlong, 100, t1, "now is the time for all good men",
- "foo_f", 1.414f, "foo_b", "true", "foo_d", 1.414d);
+ chaosMonkey.startTheMonkey();
- commit();
+ Thread.sleep(12000);
- // these queries should be exactly ordered and scores should exactly match
- query("q", "*:*", "sort", i1 + " desc");
+ chaosMonkey.stopTheMonkey();
}
@Override
Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java?rev=1214865&r1=1214864&r2=1214865&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java Thu Dec 15 17:04:53 2011
@@ -17,6 +17,7 @@ package org.apache.solr.cloud;
* limitations under the License.
*/
+import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Random;
@@ -35,6 +36,7 @@ public class ChaosMonkey {
private ZkStateReader zkStateReader;
private String collection;
private Random random;
+ private volatile boolean stop = false;
public ChaosMonkey(ZkTestServer zkServer, ZkStateReader zkStateReader,
String collection, Map<String,List<CloudJettyRunner>> shardToJetty,
@@ -74,7 +76,7 @@ public class ChaosMonkey {
public void stopShardExcept(String slice, String shardName) throws Exception {
List<CloudJettyRunner> jetties = shardToJetty.get(slice);
for (CloudJettyRunner jetty : jetties) {
- if (!jetty.shardName.equals(shardName)) {
+ if (!jetty.nodeName.equals(shardName)) {
stopJetty(jetty.jetty);
}
}
@@ -87,10 +89,13 @@ public class ChaosMonkey {
public JettySolrRunner stopRandomShard() throws Exception {
// add all the shards to a list
-// CloudState clusterState = zk.getCloudState();
-// for (String collection : collections) {
-// Slice theShards = zk.getCloudState().getSlices(collection);
- return null;
+ Map<String,Slice> slices = zkStateReader.getCloudState().getSlices(collection);
+
+ List<String> sliceKeyList = new ArrayList<String>(slices.size());
+ sliceKeyList.addAll(slices.keySet());
+ String sliceName = sliceKeyList.get(random.nextInt(sliceKeyList.size()));
+
+ return stopRandomShard(sliceName);
}
public JettySolrRunner stopRandomShard(String slice) throws Exception {
@@ -104,6 +109,9 @@ public class ChaosMonkey {
boolean running = true;
ZkNodeProps props = theShards.getShards().get(cloudJetty.shardName);
+ if (props == null) {
+ throw new RuntimeException("shard name " + cloudJetty.shardName + " not found in " + theShards.getShards().keySet());
+ }
String state = props.get(ZkStateReader.STATE_PROP);
String nodeName = props.get(ZkStateReader.NODE_NAME_PROP);
@@ -123,12 +131,55 @@ public class ChaosMonkey {
return null;
}
- // kill random shard in shard2
+ // kill random shard
List<CloudJettyRunner> jetties = shardToJetty.get(slice);
int index = random.nextInt(jetties.size() - 1);
JettySolrRunner jetty = jetties.get(index).jetty;
jetty.stop();
return jetty;
}
+
+ // starts a background thread that randomly stops and restarts shards until stopTheMonkey() is called
+ public void startTheMonkey() {
+ stop = false;
+ new Thread() {
+ private List<JettySolrRunner> deadPool = new ArrayList<JettySolrRunner>();
+
+ @Override
+ public void run() {
+ while (!stop) {
+ try {
+ Thread.sleep(500);
+
+ if (random.nextBoolean()) {
+ if (!deadPool.isEmpty()) {
+ System.out.println("start jetty");
+ JettySolrRunner jetty = deadPool.remove(random.nextInt(deadPool.size()));
+ jetty.start();
+ continue;
+ }
+ }
+
+ JettySolrRunner jetty = stopRandomShard();
+ if (jetty == null) {
+ System.out.println("we cannot kill");
+ } else {
+ deadPool.add(jetty);
+ System.out.println("we killed");
+ }
+ } catch (InterruptedException e) {
+ // interruption is ignored here; the loop condition re-checks the stop flag
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+ }.start();
+ }
+
+ public void stopTheMonkey() {
+ stop = true;
+ }
}
\ No newline at end of file
Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java?rev=1214865&r1=1214864&r2=1214865&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java Thu Dec 15 17:04:53 2011
@@ -90,12 +90,13 @@ public class FullDistributedZkTest exten
protected Map<String,List<CloudJettyRunner>> shardToJetty = new HashMap<String,List<CloudJettyRunner>>();
private AtomicInteger i = new AtomicInteger(0);
protected ChaosMonkey chaosMonkey;
- private volatile ZkStateReader zkStateReader;
+ protected volatile ZkStateReader zkStateReader;
class CloudJettyRunner {
JettySolrRunner jetty;
+ String nodeName;
String shardName;
}
@@ -138,8 +139,6 @@ public class FullDistributedZkTest exten
public void setUp() throws Exception {
super.setUp();
System.setProperty("numShards", Integer.toString(sliceCount));
-
- chaosMonkey = new ChaosMonkey(zkServer, zkStateReader, DEFAULT_COLLECTION, shardToJetty, random);
}
@BeforeClass
@@ -179,6 +178,8 @@ public class FullDistributedZkTest exten
zkStateReader.createClusterStateWatchersAndUpdate();
}
+
+ chaosMonkey = new ChaosMonkey(zkServer, zkStateReader, DEFAULT_COLLECTION, shardToJetty, random);
}
// wait until shards have started registering...
@@ -313,7 +314,8 @@ public class FullDistributedZkTest exten
}
CloudJettyRunner cjr = new CloudJettyRunner();
cjr.jetty = jetty;
- cjr.shardName = shard.getValue().get(ZkStateReader.NODE_NAME_PROP);
+ cjr.nodeName = shard.getValue().get(ZkStateReader.NODE_NAME_PROP);
+ cjr.shardName = shard.getKey();
list.add(cjr);
}
}
Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java?rev=1214865&r1=1214864&r2=1214865&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java Thu Dec 15 17:04:53 2011
@@ -67,7 +67,7 @@ public class OverseerTest extends SolrTe
System.setProperty(ZkStateReader.NUM_SHARDS_PROP, "3");
zkController = new ZkController(server.getZkAddress(), TIMEOUT, 10000,
- "localhost", "8983", "solr",3, new CurrentCoreDescriptorProvider() {
+ "localhost", "8983", "solr", new CurrentCoreDescriptorProvider() {
@Override
public List<CoreDescriptor> getCurrentDescriptors() {
Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java?rev=1214865&r1=1214864&r2=1214865&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java Thu Dec 15 17:04:53 2011
@@ -72,7 +72,7 @@ public class ZkControllerTest extends So
}
zkClient.close();
ZkController zkController = new ZkController(server.getZkAddress(), TIMEOUT, 10000,
- "localhost", "8983", "/solr", 3, new CurrentCoreDescriptorProvider() {
+ "localhost", "8983", "/solr", new CurrentCoreDescriptorProvider() {
@Override
public List<CoreDescriptor> getCurrentDescriptors() {
@@ -106,7 +106,7 @@ public class ZkControllerTest extends So
AbstractZkTestCase.makeSolrZkNode(server.getZkHost());
zkController = new ZkController(server.getZkAddress(),
- TIMEOUT, 10000, "localhost", "8983", "/solr", 3, new CurrentCoreDescriptorProvider() {
+ TIMEOUT, 10000, "localhost", "8983", "/solr", new CurrentCoreDescriptorProvider() {
@Override
public List<CoreDescriptor> getCurrentDescriptors() {