You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2011/12/15 18:04:54 UTC

svn commit: r1214865 - in /lucene/dev/branches/solrcloud/solr: ./ core/src/java/org/apache/solr/cloud/ core/src/java/org/apache/solr/core/ core/src/test/org/apache/solr/cloud/

Author: markrmiller
Date: Thu Dec 15 17:04:53 2011
New Revision: 1214865

URL: http://svn.apache.org/viewvc?rev=1214865&view=rev
Log:
more work on the full chaos monkey test and remove some numShards stuff that is no longer used

Modified:
    lucene/dev/branches/solrcloud/solr/   (props changed)
    lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java
    lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkController.java
    lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/core/CoreContainer.java
    lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonekyDistributedZkTest.java
    lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java
    lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java
    lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
    lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java

Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java?rev=1214865&r1=1214864&r2=1214865&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java Thu Dec 15 17:04:53 2011
@@ -18,12 +18,15 @@ package org.apache.solr.cloud;
  */
 
 import java.io.IOException;
+import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
 
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
 import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.core.RequestHandlers.LazyRequestHandlerWrapper;
 import org.apache.solr.core.SolrCore;
@@ -54,6 +57,9 @@ public class RecoveryStrat {
     public void finishedRecovery();
   }
   
+  // TODO: we want to be pretty noisy if we don't properly recover?
+  // Also, if we cannot talk to the leader, we need to pause and see if there is
+  // a new leader or continue trying
   public void recover(final SolrCore core, final String leaderUrl,
       final boolean iamLeader, final OnFinish onFinish) {
     log.info("Start recovery process");
@@ -91,17 +97,14 @@ public class RecoveryStrat {
           onFinish.run();
         } catch (SolrServerException e) {
           log.error("", e);
-          // nocommit
-          e.printStackTrace();
         } catch (IOException e) {
           log.error("", e);
-          // nocommit
-          e.printStackTrace();
-        } catch (Exception e) {
+        } catch (InterruptedException e) {
+          Thread.currentThread().interrupt();
           log.error("", e);
-          // nocommit
-          e.printStackTrace();
-        }
+        } catch (ExecutionException e) {
+          log.error("", e);
+        } 
         log.info("Finished recovery process");
         // nocommit: if we get an exception, recovery failed...
       }
@@ -111,7 +114,7 @@ public class RecoveryStrat {
   }
   
   private void doRecovery(SolrCore core, String leaderUrl, boolean iamleader)
-      throws Exception, SolrServerException, IOException {
+      throws SolrServerException, IOException {
     
     // start buffer updates to tran log
     // and do recovery - either replay via realtime get (eventually)
@@ -136,6 +139,7 @@ public class RecoveryStrat {
       ReplicationHandler replicationHandler = (ReplicationHandler) handler;
       
       if (replicationHandler == null) {
+        // nocommit: we should not just return - we don't want to falsely advertise as active
         log.error("Skipping recovery, no /replication handler found");
         return;
       }

Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkController.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkController.java?rev=1214865&r1=1214864&r2=1214865&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkController.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/ZkController.java Thu Dec 15 17:04:53 2011
@@ -92,8 +92,6 @@ public final class ZkController {
   private String hostName;
 
   private OverseerElector overseerElector;
-  
-  private int numShards; //not used anywhere now
 
   private Map<String, CoreAssignment> assignments = new HashMap<String, CoreAssignment>();
 
@@ -139,14 +137,13 @@ public final class ZkController {
    * @throws IOException
    */
   public ZkController(String zkServerAddress, int zkClientTimeout, int zkClientConnectTimeout, String localHost, String locaHostPort,
-      String localHostContext, int numShards, final CurrentCoreDescriptorProvider registerOnReconnect) throws InterruptedException,
+      String localHostContext, final CurrentCoreDescriptorProvider registerOnReconnect) throws InterruptedException,
       TimeoutException, IOException {
  
     this.zkServerAddress = zkServerAddress;
     this.localHostPort = locaHostPort;
     this.localHostContext = localHostContext;
     this.localHost = localHost;
-    this.numShards = numShards;
 
     zkClient = new SolrZkClient(zkServerAddress, zkClientTimeout, zkClientConnectTimeout,
         // on reconnect, reload cloud info

Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/core/CoreContainer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/core/CoreContainer.java?rev=1214865&r1=1214864&r2=1214865&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/core/CoreContainer.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/core/CoreContainer.java Thu Dec 15 17:04:53 2011
@@ -88,7 +88,6 @@ public class CoreContainer 
   private SolrZkServer zkServer;
 
   private String zkHost;
-  private int numShards;
 
   {
     log.info("New CoreContainer " + System.identityHashCode(this));
@@ -165,7 +164,7 @@ public class CoreContainer 
         } else {
           log.info("Zookeeper client=" + zookeeperHost);          
         }
-        zkController = new ZkController(zookeeperHost, zkClientTimeout, zkClientConnectTimeout, host, hostPort, hostContext, numShards, new CurrentCoreDescriptorProvider() {
+        zkController = new ZkController(zookeeperHost, zkClientTimeout, zkClientConnectTimeout, host, hostPort, hostContext, new CurrentCoreDescriptorProvider() {
           
           @Override
           public List<CoreDescriptor> getCurrentDescriptors() {
@@ -319,9 +318,6 @@ public class CoreContainer 
 
     hostContext = cfg.get("solr/cores/@hostContext", "solr");
     host = cfg.get("solr/cores/@host", null);
-    
-    // TODO: allow override by core so you can change on the fly?
-    numShards = cfg.getInt("solr/cores/@numShards", 3);
 
     if(shareSchema){
       indexSchemaCache = new ConcurrentHashMap<String ,IndexSchema>();

Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonekyDistributedZkTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonekyDistributedZkTest.java?rev=1214865&r1=1214864&r2=1214865&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonekyDistributedZkTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonekyDistributedZkTest.java Thu Dec 15 17:04:53 2011
@@ -23,10 +23,8 @@ import org.junit.Ignore;
 /**
  *
  */
-@Ignore("this test is not ready")
 public class ChaosMonekyDistributedZkTest extends FullDistributedZkTest {
   
-  
   @BeforeClass
   public static void beforeSuperClass() throws Exception {
     
@@ -38,19 +36,18 @@ public class ChaosMonekyDistributedZkTes
   
   @Override
   public void doTest() throws Exception {
+    
     handle.clear();
     handle.put("QTime", SKIPVAL);
     handle.put("timestamp", SKIPVAL);
     
     del("*:*");
     
-    indexr(id, 1, i1, 100, tlong, 100, t1, "now is the time for all good men",
-        "foo_f", 1.414f, "foo_b", "true", "foo_d", 1.414d);
+    chaosMonkey.startTheMonkey();
     
-    commit();
+    Thread.sleep(12000);
     
-    // these queries should be exactly ordered and scores should exactly match
-    query("q", "*:*", "sort", i1 + " desc");
+    chaosMonkey.stopTheMonkey();
   }
   
   @Override

Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java?rev=1214865&r1=1214864&r2=1214865&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java Thu Dec 15 17:04:53 2011
@@ -17,6 +17,7 @@ package org.apache.solr.cloud;
  * limitations under the License.
  */
 
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
@@ -35,6 +36,7 @@ public class ChaosMonkey {
   private ZkStateReader zkStateReader;
   private String collection;
   private Random random;
+  private volatile boolean stop = false;
 
   public ChaosMonkey(ZkTestServer zkServer, ZkStateReader zkStateReader,
       String collection, Map<String,List<CloudJettyRunner>> shardToJetty,
@@ -74,7 +76,7 @@ public class ChaosMonkey {
   public void stopShardExcept(String slice, String shardName) throws Exception {
     List<CloudJettyRunner> jetties = shardToJetty.get(slice);
     for (CloudJettyRunner jetty : jetties) {
-      if (!jetty.shardName.equals(shardName)) {
+      if (!jetty.nodeName.equals(shardName)) {
         stopJetty(jetty.jetty);
       }
     }
@@ -87,10 +89,13 @@ public class ChaosMonkey {
   
   public JettySolrRunner stopRandomShard() throws Exception {
     // add all the shards to a list
-//    CloudState clusterState = zk.getCloudState();
-//    for (String collection : collections)   {
-//    Slice theShards = zk.getCloudState().getSlices(collection);
-    return null;
+    Map<String,Slice> slices = zkStateReader.getCloudState().getSlices(collection);
+    
+    List<String> sliceKeyList = new ArrayList<String>(slices.size());
+    sliceKeyList.addAll(slices.keySet());
+    String sliceName = sliceKeyList.get(random.nextInt(sliceKeyList.size()));
+    
+    return stopRandomShard(sliceName);
   }
   
   public JettySolrRunner stopRandomShard(String slice) throws Exception {
@@ -104,6 +109,9 @@ public class ChaosMonkey {
       boolean running = true;
       
       ZkNodeProps props = theShards.getShards().get(cloudJetty.shardName);
+      if (props == null) {
+        throw new RuntimeException("shard name " + cloudJetty.shardName + " not found in " + theShards.getShards().keySet());
+      }
       String state = props.get(ZkStateReader.STATE_PROP);
       String nodeName = props.get(ZkStateReader.NODE_NAME_PROP);
       
@@ -123,12 +131,55 @@ public class ChaosMonkey {
       return null;
     }
     
-    // kill random shard in shard2
+    // kill random shard
     List<CloudJettyRunner> jetties = shardToJetty.get(slice);
     int index = random.nextInt(jetties.size() - 1);
     JettySolrRunner jetty = jetties.get(index).jetty;
     jetty.stop();
     return jetty;
   }
+  
+  // asynchronously starts and stops shards at random on a background thread until stopTheMonkey() is called
+  public void startTheMonkey() {
+    stop = false;
+    new Thread() {
+      private List<JettySolrRunner> deadPool = new ArrayList<JettySolrRunner>();
+
+      @Override
+      public void run() {
+        while (!stop) {
+          try {
+            Thread.sleep(500);
+            
+            if (random.nextBoolean()) {
+             if (!deadPool.isEmpty()) {
+               System.out.println("start jetty");
+               JettySolrRunner jetty = deadPool.remove(random.nextInt(deadPool.size()));
+               jetty.start();
+               continue;
+             }
+            }
+            
+            JettySolrRunner jetty = stopRandomShard();
+            if (jetty == null) {
+              System.out.println("we cannot kill");
+            } else {
+              deadPool.add(jetty);
+              System.out.println("we killed");
+            }
+          } catch (InterruptedException e) {
+            //
+          } catch (Exception e) {
+            // TODO Auto-generated catch block
+            e.printStackTrace();
+          }
+        }
+      }
+    }.start();
+  }
+  
+  public void stopTheMonkey() {
+    stop = true;
+  }
 
 }
\ No newline at end of file

Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java?rev=1214865&r1=1214864&r2=1214865&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java Thu Dec 15 17:04:53 2011
@@ -90,12 +90,13 @@ public class FullDistributedZkTest exten
   protected Map<String,List<CloudJettyRunner>> shardToJetty = new HashMap<String,List<CloudJettyRunner>>();
   private AtomicInteger i = new AtomicInteger(0);
   protected ChaosMonkey chaosMonkey;
-  private volatile ZkStateReader zkStateReader;
+  protected volatile ZkStateReader zkStateReader;
 
 
   
   class CloudJettyRunner {
     JettySolrRunner jetty;
+    String nodeName;
     String shardName;
   }
   
@@ -138,8 +139,6 @@ public class FullDistributedZkTest exten
   public void setUp() throws Exception {
     super.setUp();
     System.setProperty("numShards", Integer.toString(sliceCount));
-    
-    chaosMonkey = new ChaosMonkey(zkServer, zkStateReader, DEFAULT_COLLECTION, shardToJetty, random);
   }
   
   @BeforeClass
@@ -179,6 +178,8 @@ public class FullDistributedZkTest exten
         
         zkStateReader.createClusterStateWatchersAndUpdate();
       }
+      
+      chaosMonkey = new ChaosMonkey(zkServer, zkStateReader, DEFAULT_COLLECTION, shardToJetty, random);
     }
     
     // wait until shards have started registering...
@@ -313,7 +314,8 @@ public class FullDistributedZkTest exten
             }
             CloudJettyRunner cjr = new CloudJettyRunner();
             cjr.jetty = jetty;
-            cjr.shardName = shard.getValue().get(ZkStateReader.NODE_NAME_PROP);
+            cjr.nodeName = shard.getValue().get(ZkStateReader.NODE_NAME_PROP);
+            cjr.shardName = shard.getKey();
             list.add(cjr);
           }
         }

Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java?rev=1214865&r1=1214864&r2=1214865&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java Thu Dec 15 17:04:53 2011
@@ -67,7 +67,7 @@ public class OverseerTest extends SolrTe
       System.setProperty(ZkStateReader.NUM_SHARDS_PROP, "3");
 
       zkController = new ZkController(server.getZkAddress(), TIMEOUT, 10000,
-          "localhost", "8983", "solr",3, new CurrentCoreDescriptorProvider() {
+          "localhost", "8983", "solr", new CurrentCoreDescriptorProvider() {
 
             @Override
             public List<CoreDescriptor> getCurrentDescriptors() {

Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java?rev=1214865&r1=1214864&r2=1214865&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java Thu Dec 15 17:04:53 2011
@@ -72,7 +72,7 @@ public class ZkControllerTest extends So
       }
       zkClient.close();
       ZkController zkController = new ZkController(server.getZkAddress(), TIMEOUT, 10000,
-          "localhost", "8983", "/solr", 3, new CurrentCoreDescriptorProvider() {
+          "localhost", "8983", "/solr", new CurrentCoreDescriptorProvider() {
             
             @Override
             public List<CoreDescriptor> getCurrentDescriptors() {
@@ -106,7 +106,7 @@ public class ZkControllerTest extends So
       AbstractZkTestCase.makeSolrZkNode(server.getZkHost());
 
       zkController = new ZkController(server.getZkAddress(),
-          TIMEOUT, 10000, "localhost", "8983", "/solr", 3, new CurrentCoreDescriptorProvider() {
+          TIMEOUT, 10000, "localhost", "8983", "/solr", new CurrentCoreDescriptorProvider() {
             
             @Override
             public List<CoreDescriptor> getCurrentDescriptors() {