You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2012/09/14 23:05:16 UTC

svn commit: r1384923 [2/2] - in /lucene/dev/trunk: lucene/test-framework/src/java/org/apache/lucene/util/ solr/ solr/core/src/java/org/apache/solr/cloud/ solr/core/src/java/org/apache/solr/core/ solr/core/src/java/org/apache/solr/handler/admin/ solr/co...

Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/update/SolrCmdDistributorTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/update/SolrCmdDistributorTest.java?rev=1384923&r1=1384922&r2=1384923&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/update/SolrCmdDistributorTest.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/update/SolrCmdDistributorTest.java Fri Sep 14 21:05:15 2012
@@ -36,6 +36,7 @@ import org.apache.solr.common.cloud.ZkCo
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.common.util.ExecutorUtil;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.update.SolrCmdDistributor.Node;
 import org.apache.solr.update.SolrCmdDistributor.Response;
@@ -43,16 +44,13 @@ import org.apache.solr.update.SolrCmdDis
 import org.apache.solr.util.DefaultSolrThreadFactory;
 
 public class SolrCmdDistributorTest extends BaseDistributedSearchTestCase {
-  private ThreadPoolExecutor executor = new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5,
-      TimeUnit.SECONDS, new SynchronousQueue<Runnable>(),
-      new DefaultSolrThreadFactory("cmdDistribExecutor"));
+  private static ThreadPoolExecutor executor;
   
   public SolrCmdDistributorTest() {
     fixShardCount = true;
     shardCount = 4;
     stress = 0;
   }
-  
 
   public static String getSchemaFile() {
     return "schema.xml";
@@ -91,7 +89,7 @@ public class SolrCmdDistributorTest exte
   public void doTest() throws Exception {
     del("*:*");
     
-    SolrCmdDistributor cmdDistrib = new SolrCmdDistributor(8, executor);
+    SolrCmdDistributor cmdDistrib = new SolrCmdDistributor(5, executor);
     
     ModifiableSolrParams params = new ModifiableSolrParams();
     List<Node> nodes = new ArrayList<Node>();
@@ -125,7 +123,7 @@ public class SolrCmdDistributorTest exte
     nodes.add(new StdNode(new ZkCoreNodeProps(nodeProps)));
     
     // add another 2 docs to control and 3 to client
-    cmdDistrib = new SolrCmdDistributor(8, executor);
+    cmdDistrib = new SolrCmdDistributor(5, executor);
     cmd.solrDoc = sdoc("id", 2);
     cmdDistrib.distribAdd(cmd, nodes, params);
     
@@ -158,7 +156,7 @@ public class SolrCmdDistributorTest exte
     DeleteUpdateCommand dcmd = new DeleteUpdateCommand(null);
     dcmd.id = "2";
     
-    cmdDistrib = new SolrCmdDistributor(8, executor);
+    cmdDistrib = new SolrCmdDistributor(5, executor);
     cmdDistrib.distribDelete(dcmd, nodes, params);
     
     cmdDistrib.distribCommit(ccmd, nodes, params);
@@ -177,19 +175,16 @@ public class SolrCmdDistributorTest exte
         .getNumFound();
     assertEquals(results.toString(), 2, numFound);
     
-    // debug stuff
     for (SolrServer c : clients) {
       c.optimize();
-      // distrib optimize is not working right yet, so call it on each client
       //System.out.println(clients.get(0).request(new LukeRequest()));
     }
     
     int id = 5;
     
-    cmdDistrib = new SolrCmdDistributor(8, executor);
+    cmdDistrib = new SolrCmdDistributor(5, executor);
     
-    nodes.clear();
-    int cnt = atLeast(200);
+    int cnt = atLeast(201);
     for (int i = 0; i < cnt; i++) {
       nodes.clear();
       for (SolrServer c : clients) {
@@ -199,25 +194,52 @@ public class SolrCmdDistributorTest exte
         HttpSolrServer httpClient = (HttpSolrServer) c;
         nodeProps = new ZkNodeProps(ZkStateReader.BASE_URL_PROP,
             httpClient.getBaseURL(), ZkStateReader.CORE_NAME_PROP, "");
+        
         nodes.add(new StdNode(new ZkCoreNodeProps(nodeProps)));
 
       }
+      AddUpdateCommand c = new AddUpdateCommand(null);
+      c.solrDoc = sdoc("id", id++);
+      if (nodes.size() > 0) {
+        cmdDistrib.distribAdd(c, nodes, params);
+      }
+    }
+    
+    nodes.clear();
+    
+    for (SolrServer c : clients) {
+      HttpSolrServer httpClient = (HttpSolrServer) c;
+      nodeProps = new ZkNodeProps(ZkStateReader.BASE_URL_PROP,
+          httpClient.getBaseURL(), ZkStateReader.CORE_NAME_PROP, "");
       
-      cmd.solrDoc = sdoc("id", id++);
-      cmdDistrib.distribAdd(cmd, nodes, params);
+      nodes.add(new StdNode(new ZkCoreNodeProps(nodeProps)));
     }
 
-    cmdDistrib.finish();
-    
     cmdDistrib.distribCommit(ccmd, nodes, params);
     
+    
+    cmdDistrib.finish();
+    
     for (SolrServer c : clients) {
       NamedList<Object> resp = c.request(new LukeRequest());
-      System.out.println(resp);
       assertEquals("SOLR-3428: We only did adds - there should be no deletes",
           ((NamedList<Object>) resp.get("index")).get("numDocs"),
           ((NamedList<Object>) resp.get("index")).get("maxDoc"));
     }
 
   }
+  
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    executor = new ThreadPoolExecutor(0, 5 * 16, 5,
+        TimeUnit.SECONDS, new SynchronousQueue<Runnable>(),
+        new DefaultSolrThreadFactory("cmdDistribExecutor"));
+  }
+  
+  @Override
+  public void tearDown() throws Exception {
+    ExecutorUtil.shutdownNowAndAwaitTermination(executor);
+    super.tearDown();
+  }
 }

Modified: lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/client/solrj/request/CoreAdminRequest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/client/solrj/request/CoreAdminRequest.java?rev=1384923&r1=1384922&r2=1384923&view=diff
==============================================================================
--- lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/client/solrj/request/CoreAdminRequest.java (original)
+++ lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/client/solrj/request/CoreAdminRequest.java Fri Sep 14 21:05:15 2012
@@ -120,7 +120,6 @@ public class CoreAdminRequest extends So
     protected String coreNodeName;
     protected String state;
     protected Boolean checkLive;
-    protected Integer pauseFor;
     protected Boolean onlyIfLeader;
     
 
@@ -160,14 +159,6 @@ public class CoreAdminRequest extends So
       this.checkLive = checkLive;
     }
     
-    public Integer getPauseFor() {
-      return pauseFor;
-    }
-
-    public void setPauseFor(Integer pauseFor) {
-      this.pauseFor = pauseFor;
-    }
-    
     public boolean isOnlyIfLeader() {
       return onlyIfLeader;
     }
@@ -202,10 +193,6 @@ public class CoreAdminRequest extends So
         params.set( "checkLive", checkLive);
       }
       
-      if (pauseFor != null) {
-        params.set( "pauseFor", pauseFor);
-      }
-      
       if (onlyIfLeader != null) {
         params.set( "onlyIfLeader", onlyIfLeader);
       }

Modified: lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java?rev=1384923&r1=1384922&r2=1384923&view=diff
==============================================================================
--- lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java (original)
+++ lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java Fri Sep 14 21:05:15 2012
@@ -410,7 +410,7 @@ public class ZkStateReader {
       }
       Thread.sleep(50);
     }
-    throw new RuntimeException("No registered leader was found, collection:" + collection + " slice:" + shard);
+    throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "No registered leader was found, collection:" + collection + " slice:" + shard);
   }
 
   /**

Modified: lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ExecutorUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ExecutorUtil.java?rev=1384923&r1=1384922&r2=1384923&view=diff
==============================================================================
--- lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ExecutorUtil.java (original)
+++ lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ExecutorUtil.java Fri Sep 14 21:05:15 2012
@@ -20,7 +20,6 @@ package org.apache.solr.common.util;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.TimeUnit;
 
-import org.apache.solr.common.SolrException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -28,24 +27,38 @@ import org.slf4j.LoggerFactory;
 public class ExecutorUtil {
   public static Logger log = LoggerFactory.getLogger(ExecutorUtil.class);
   
-  public static void shutdownAndAwaitTermination(ExecutorService pool) {
+  public static void shutdownNowAndAwaitTermination(ExecutorService pool) {
     pool.shutdown(); // Disable new tasks from being submitted
     pool.shutdownNow(); // Cancel currently executing tasks
-    try {
-      // Wait a while for existing tasks to terminate
-        if (!pool.awaitTermination(60, TimeUnit.SECONDS))
-            SolrException.log(log, "Executor still has running tasks.");
-    } catch (InterruptedException ie) {
-      // (Re-)Cancel if current thread also interrupted
-      pool.shutdownNow();
+    boolean shutdown = false;
+    while (!shutdown) {
+      try {
+        // Wait a while for existing tasks to terminate
+        shutdown = pool.awaitTermination(5, TimeUnit.SECONDS);
+      } catch (InterruptedException ie) {
+        // Preserve interrupt status
+        Thread.currentThread().interrupt();
+      }
+      if (!shutdown) {
+        pool.shutdownNow(); // Cancel currently executing tasks
+      }
+    }
+  }
+  
+  public static void shutdownAndAwaitTermination(ExecutorService pool) {
+    pool.shutdown(); // Disable new tasks from being submitted
+    boolean shutdown = false;
+    while (!shutdown) {
       try {
-        if (!pool.awaitTermination(60, TimeUnit.SECONDS))
-          SolrException.log(log, "Executor still has running tasks.");
-      } catch (InterruptedException e) {
-    
+        // Wait a while for existing tasks to terminate
+        shutdown = pool.awaitTermination(60, TimeUnit.SECONDS);
+      } catch (InterruptedException ie) {
+        // Preserve interrupt status
+        Thread.currentThread().interrupt();
+      }
+      if (!shutdown) {
+        pool.shutdownNow(); // Cancel currently executing tasks
       }
-      // Preserve interrupt status
-      Thread.currentThread().interrupt();
     }
   }
 }

Modified: lucene/dev/trunk/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java?rev=1384923&r1=1384922&r2=1384923&view=diff
==============================================================================
--- lucene/dev/trunk/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java (original)
+++ lucene/dev/trunk/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java Fri Sep 14 21:05:15 2012
@@ -242,7 +242,7 @@ public abstract class AbstractFullDistri
           server.getLbServer().getHttpClient().getParams()
               .setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 5000);
           server.getLbServer().getHttpClient().getParams()
-              .setParameter(CoreConnectionPNames.SO_TIMEOUT, 40000);
+              .setParameter(CoreConnectionPNames.SO_TIMEOUT, 20000);
           cloudClient = server;
         } catch (MalformedURLException e) {
           throw new RuntimeException(e);
@@ -891,7 +891,7 @@ public abstract class AbstractFullDistri
               cnt += results;
               break;
             }
-          } catch (SolrServerException e) {
+          } catch (Exception e) {
             // if we have a problem, try the next one
             if (i == times - 1) {
               throw e;

Modified: lucene/dev/trunk/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java?rev=1384923&r1=1384922&r2=1384923&view=diff
==============================================================================
--- lucene/dev/trunk/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java (original)
+++ lucene/dev/trunk/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java Fri Sep 14 21:05:15 2012
@@ -32,6 +32,7 @@ import org.apache.solr.common.cloud.Solr
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.core.CoreContainer;
+import org.apache.solr.core.SolrCore;
 import org.apache.solr.servlet.SolrDispatchFilter;
 import org.apache.zookeeper.KeeperException;
 import org.eclipse.jetty.servlet.FilterHolder;
@@ -43,13 +44,16 @@ import org.slf4j.LoggerFactory;
  * 
  * It can also run in a background thread and start and stop jetties
  * randomly.
- *
+ * TODO: expire multiple sessions / connectionloss at once
+ * TODO: kill multiple jetties at once
+ * TODO: ? add random headhunter mode that always kills the leader
+ * TODO: chaosmonkey should be able to do cluster stop/start tests
  */
 public class ChaosMonkey {
   private static Logger log = LoggerFactory.getLogger(ChaosMonkey.class);
   
-  private static final int CONLOSS_PERCENT = 3; //30%
-  private static final int EXPIRE_PERCENT = 4; //40%
+  private static final int CONLOSS_PERCENT = 10; // 0 - 10 = 0 - 100%
+  private static final int EXPIRE_PERCENT = 10; // 0 - 10 = 0 - 100%
   private Map<String,List<CloudJettyRunner>> shardToJetty;
   
   private ZkTestServer zkServer;
@@ -79,22 +83,50 @@ public class ChaosMonkey {
     this.zkStateReader = zkStateReader;
     this.collection = collection;
     Random random = LuceneTestCase.random();
-    expireSessions = random.nextBoolean();
-    causeConnectionLoss = random.nextBoolean();
+    expireSessions = true; //= random.nextBoolean();
+    
+    causeConnectionLoss = true;//= random.nextBoolean();
     monkeyLog("init - expire sessions:" + expireSessions
         + " cause connection loss:" + causeConnectionLoss);
   }
   
-  public void expireSession(JettySolrRunner jetty) {
+  // TODO: expire all clients at once?
+  public void expireSession(final JettySolrRunner jetty) {
     monkeyLog("expire session for " + jetty.getLocalPort() + " !");
-    SolrDispatchFilter solrDispatchFilter = (SolrDispatchFilter) jetty.getDispatchFilter().getFilter();
+    
+    SolrDispatchFilter solrDispatchFilter = (SolrDispatchFilter) jetty
+        .getDispatchFilter().getFilter();
     if (solrDispatchFilter != null) {
       CoreContainer cores = solrDispatchFilter.getCores();
       if (cores != null) {
-        long sessionId = cores.getZkController().getZkClient().getSolrZooKeeper().getSessionId();
-        zkServer.expire(sessionId);
+        causeConnectionLoss(jetty, cores.getZkController().getClientTimeout() + 200);
       }
     }
+    
+
+//    Thread thread = new Thread() {
+//      {
+//        setDaemon(true);
+//      }
+//      public void run() {
+//        SolrDispatchFilter solrDispatchFilter = (SolrDispatchFilter) jetty.getDispatchFilter().getFilter();
+//        if (solrDispatchFilter != null) {
+//          CoreContainer cores = solrDispatchFilter.getCores();
+//          if (cores != null) {
+//            try {
+//              Thread.sleep(ZkTestServer.TICK_TIME * 2 + 800);
+//            } catch (InterruptedException e) {
+//              // we act as only connection loss
+//              return;
+//            }
+//            long sessionId = cores.getZkController().getZkClient().getSolrZooKeeper().getSessionId();
+//            zkServer.expire(sessionId);
+//          }
+//        }
+//      }
+//    };
+//    thread.start();
+
   }
   
   public void expireRandomSession() throws KeeperException, InterruptedException {
@@ -119,6 +151,10 @@ public class ChaosMonkey {
   }
   
   private void causeConnectionLoss(JettySolrRunner jetty) {
+    causeConnectionLoss(jetty, ZkTestServer.TICK_TIME * 2 + 200);
+  }
+  
+  private void causeConnectionLoss(JettySolrRunner jetty, int pauseTime) {
     SolrDispatchFilter solrDispatchFilter = (SolrDispatchFilter) jetty
         .getDispatchFilter().getFilter();
     if (solrDispatchFilter != null) {
@@ -126,7 +162,7 @@ public class ChaosMonkey {
       if (cores != null) {
         SolrZkClient zkClient = cores.getZkController().getZkClient();
         // must be at least double tick time...
-        zkClient.getSolrZooKeeper().pauseCnxn(ZkTestServer.TICK_TIME * 2 + 200);
+        zkClient.getSolrZooKeeper().pauseCnxn(pauseTime);
       }
     }
   }
@@ -291,6 +327,7 @@ public class ChaosMonkey {
       }
     }
     
+    // TODO: stale state makes this a tough call
     if (numActive < 2) {
       // we cannot kill anyone
       monkeyLog("only one active node in shard - monkey cannot kill :(");
@@ -308,8 +345,44 @@ public class ChaosMonkey {
       int index = random.nextInt(jetties.size());
       cjetty = jetties.get(index);
       
-      ZkNodeProps leader = zkStateReader.getLeaderProps(collection, slice);
-      boolean isLeader = leader.getStr(ZkStateReader.NODE_NAME_PROP).equals(jetties.get(index).nodeName);
+      ZkNodeProps leader = null;
+      try {
+        leader = zkStateReader.getLeaderProps(collection, slice);
+      } catch (Throwable t) {
+        log.error("Could not get leader", t);
+        return null;
+      }
+      
+      FilterHolder fh = cjetty.jetty.getDispatchFilter();
+      if (fh == null) {
+        monkeyLog("selected jetty not running correctly - skip");
+        return null;
+      }
+      SolrDispatchFilter df = ((SolrDispatchFilter) fh.getFilter());
+      if (df == null) {
+        monkeyLog("selected jetty not running correctly - skip");
+        return null;
+      }
+      CoreContainer cores = df.getCores();
+      if (cores == null) {
+        monkeyLog("selected jetty not running correctly - skip");
+        return null;
+      }
+      SolrCore core = cores.getCore(leader.getStr(ZkStateReader.CORE_NAME_PROP));
+      if (core == null) {
+        monkeyLog("selected jetty not running correctly - skip");
+        return null;
+      }
+      // cluster state can be stale - also go by our 'near real-time' is leader prop
+      boolean rtIsLeader;
+      try {
+        rtIsLeader = core.getCoreDescriptor().getCloudDescriptor().isLeader();
+      } finally {
+        core.close();
+      }
+      
+      boolean isLeader = leader.getStr(ZkStateReader.NODE_NAME_PROP).equals(jetties.get(index).nodeName)
+          || rtIsLeader;
       if (!aggressivelyKillLeaders && isLeader) {
         // we don't kill leaders...
         monkeyLog("abort! I don't kill leaders");
@@ -343,7 +416,7 @@ public class ChaosMonkey {
   
   // synchronously starts and stops shards randomly, unless there is only one
   // active shard up for a slice or if there is one active and others recovering
-  public void startTheMonkey(boolean killLeaders, final int roundPause) {
+  public void startTheMonkey(boolean killLeaders, final int roundPauseUpperLimit) {
     monkeyLog("starting");
     this.aggressivelyKillLeaders = killLeaders;
     startTime = System.currentTimeMillis();
@@ -357,8 +430,9 @@ public class ChaosMonkey {
       public void run() {
         while (!stop) {
           try {
-            Thread.sleep(roundPause);
+    
             Random random = LuceneTestCase.random();
+            Thread.sleep(random.nextInt(roundPauseUpperLimit));
             if (random.nextBoolean()) {
              if (!deadPool.isEmpty()) {
                int index = random.nextInt(deadPool.size());

Modified: lucene/dev/trunk/solr/test-framework/src/java/org/apache/solr/cloud/ZkTestServer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/test-framework/src/java/org/apache/solr/cloud/ZkTestServer.java?rev=1384923&r1=1384922&r2=1384923&view=diff
==============================================================================
--- lucene/dev/trunk/solr/test-framework/src/java/org/apache/solr/cloud/ZkTestServer.java (original)
+++ lucene/dev/trunk/solr/test-framework/src/java/org/apache/solr/cloud/ZkTestServer.java Fri Sep 14 21:05:15 2012
@@ -43,7 +43,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 public class ZkTestServer {
-  public static final int TICK_TIME = 3000;
+  public static final int TICK_TIME = 1000;
 
   private static Logger log = LoggerFactory.getLogger(ZkTestServer.class);