You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/04/29 18:10:16 UTC

svn commit: r1477170 - in /manifoldcf/trunk: ./ tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/

Author: kwright
Date: Mon Apr 29 16:10:10 2013
New Revision: 1477170

URL: http://svn.apache.org/r1477170
Log:
Load test which exercises throttling on a mixed set of pages, some of which have errors, and restarts the crawl often.

Added:
    manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingDerbyLT.java   (with props)
Modified:
    manifoldcf/trunk/build.xml
    manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/MockWebService.java
    manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingPostgresqlLT.java
    manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingTester.java

Modified: manifoldcf/trunk/build.xml
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/build.xml?rev=1477170&r1=1477169&r2=1477170&view=diff
==============================================================================
--- manifoldcf/trunk/build.xml (original)
+++ manifoldcf/trunk/build.xml Mon Apr 29 16:10:10 2013
@@ -2307,6 +2307,10 @@
         <ant dir="tests/webcrawler" target="run-postgresql"/>
     </target>
 
+    <target name="run-webcrawler-loadtests-derby" depends="build-tests-framework,build-tests-webcrawler-connector,build-tests-nulloutput-connector,calculate-webcrawler-tests-condition" if="webcrawler-tests.include">
+        <ant dir="tests/webcrawler" target="run-load-derby"/>
+    </target>
+
     <target name="run-webcrawler-loadtests-postgresql" depends="build-tests-framework,build-tests-webcrawler-connector,build-tests-nulloutput-connector,calculate-webcrawler-tests-condition" if="webcrawler-tests.include">
         <ant dir="tests/webcrawler" target="run-load-postgresql"/>
     </target>

Modified: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/MockWebService.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/MockWebService.java?rev=1477170&r1=1477169&r2=1477170&view=diff
==============================================================================
--- manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/MockWebService.java (original)
+++ manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/MockWebService.java Mon Apr 29 16:10:10 2013
@@ -36,12 +36,17 @@ public class MockWebService
 {
   Server server;
   WebServlet servlet;
-    
+  
   public MockWebService(int docsPerLevel)
   {
+    this(docsPerLevel, 10, false);
+  }
+  
+  public MockWebService(int docsPerLevel, int maxLevels, boolean generateBadPages)
+  {
     server = new Server(8191);
     server.setThreadPool(new QueuedThreadPool(100));
-    servlet = new WebServlet(docsPerLevel);
+    servlet = new WebServlet(docsPerLevel, maxLevels, generateBadPages);
     ServletContextHandler context = new ServletContextHandler(ServletContextHandler.SESSIONS);
     context.setContextPath("/web");
     server.setHandler(context);
@@ -61,11 +66,15 @@ public class MockWebService
   
   public static class WebServlet extends HttpServlet
   {
-    int docsPerLevel;
+    final int docsPerLevel;
+    final int maxLevels;
+    final boolean generateBadPages;
     
-    public WebServlet(int docsPerLevel)
+    public WebServlet(int docsPerLevel, int maxLevels, boolean generateBadPages)
     {
       this.docsPerLevel = docsPerLevel;
+      this.maxLevels = maxLevels;
+      this.generateBadPages = generateBadPages;
     }
     
     @Override
@@ -96,7 +105,9 @@ public class MockWebService
         {
           throw new IOException("Level number must be a number: "+level);
         }
-        
+        if (theLevel >= maxLevels)
+          throw new IOException("Level number too big.");
+
         int theItem;
         try
         {
@@ -119,43 +130,55 @@ public class MockWebService
           throw new IOException("Doc number too big: "+theItem+" ; level "+theLevel+" ; docsPerLevel "+docsPerLevel);
 
         // Generate the page
-        res.setStatus(HttpServletResponse.SC_OK);
-        res.setContentType("text/html; charset=utf-8");
-        res.getWriter().printf("<html>\n");
-        res.getWriter().printf("  <body>\n");
-
-        res.getWriter().printf("This is doc number "+theItem+" and level number "+theLevel+" in site "+site+"\n");
-
-        // Generate links to all parents
-        int parentLevel = theLevel;
-        int parentItem = theItem;
-        while (parentLevel > 0)
-        {
-          parentLevel--;
-          parentItem /= docsPerLevel;
-          generateLink(res,site,parentLevel,parentItem);
-        }
-        
-        // Temporary: Prevent links to children deeper than a certain level; this is to help
-        // the debug process
-        if (theLevel < 9)
+        if (generateBadPages && (theItem % 2) == 1)
         {
-          // Generate links to direct children
-          for (int i = 0; i < docsPerLevel; i++)
+          // Generate a bad page.  This is a page with a non-200 return code, and with some content
+          // > 1024 characters
+          res.setStatus(HttpServletResponse.SC_UNAUTHORIZED);
+          res.getWriter().printf("This is the error message for a 401 page.");
+          for (int i = 0; i < 100; i++)
           {
-            int docNumber = i + theItem * docsPerLevel;
-            generateLink(res,site,theLevel+1,docNumber);
+            res.getWriter().printf(" Error message # "+i);
           }
         }
-        
-        // Generate some limited cross-links to other items at this level
-        for (int i = theItem; i < maxDocsThisLevel && i < theItem + docsPerLevel; i++)
+        else
         {
-          generateLink(res,site,theLevel,i);
+          res.setStatus(HttpServletResponse.SC_OK);
+          res.setContentType("text/html; charset=utf-8");
+          res.getWriter().printf("<html>\n");
+          res.getWriter().printf("  <body>\n");
+
+          res.getWriter().printf("This is doc number "+theItem+" and level number "+theLevel+" in site "+site+"\n");
+
+          // Generate links to all parents
+          int parentLevel = theLevel;
+          int parentItem = theItem;
+          while (parentLevel > 0)
+          {
+            parentLevel--;
+            parentItem /= docsPerLevel;
+            generateLink(res,site,parentLevel,parentItem);
+          }
+          
+          if (theLevel < maxLevels-1)
+          {
+            // Generate links to direct children
+            for (int i = 0; i < docsPerLevel; i++)
+            {
+              int docNumber = i + theItem * docsPerLevel;
+              generateLink(res,site,theLevel+1,docNumber);
+            }
+          }
+          
+          // Generate some limited cross-links to other items at this level
+          for (int i = theItem; i < maxDocsThisLevel && i < theItem + docsPerLevel; i++)
+          {
+            generateLink(res,site,theLevel,i);
+          }
+          
+          res.getWriter().printf("  </body>\n");
+          res.getWriter().printf("</html>\n");
         }
-        
-        res.getWriter().printf("  </body>\n");
-        res.getWriter().printf("</html>\n");
         res.getWriter().flush();
       }
       catch (IOException e)

Added: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingDerbyLT.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingDerbyLT.java?rev=1477170&view=auto
==============================================================================
--- manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingDerbyLT.java (added)
+++ manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingDerbyLT.java Mon Apr 29 16:10:10 2013
@@ -0,0 +1,61 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.webcrawler_tests;
+
+import java.io.*;
+import java.util.*;
+import org.junit.*;
+
+/** Throttling load test for the web crawler, run using the Derby test base class */
+public class ThrottlingDerbyLT extends BaseDerby
+{
+
+  protected ThrottlingTester tester;
+  protected MockWebService webService = null;
+  
+  public ThrottlingDerbyLT()
+  {
+    tester = new ThrottlingTester(mcfInstance);
+  }
+  
+  // Set up and tear down the mock web service
+  
+  @Before
+  public void createWebService()
+    throws Exception
+  {
+    webService = new MockWebService(10,2,true);
+    webService.start();
+  }
+  
+  @After
+  public void shutdownWebService()
+    throws Exception
+  {
+    if (webService != null)
+      webService.stop();
+  }
+
+  @Test
+  public void bigCrawl()
+    throws Exception
+  {
+    tester.executeTest();
+  }
+}

Propchange: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingDerbyLT.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingDerbyLT.java
------------------------------------------------------------------------------
    svn:keywords = Id

Modified: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingPostgresqlLT.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingPostgresqlLT.java?rev=1477170&r1=1477169&r2=1477170&view=diff
==============================================================================
--- manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingPostgresqlLT.java (original)
+++ manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingPostgresqlLT.java Mon Apr 29 16:10:10 2013
@@ -40,7 +40,7 @@ public class ThrottlingPostgresqlLT exte
   public void createWebService()
     throws Exception
   {
-    webService = new MockWebService(10);
+    webService = new MockWebService(10,2,true);
     webService.start();
   }
   

Modified: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingTester.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingTester.java?rev=1477170&r1=1477169&r2=1477170&view=diff
==============================================================================
--- manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingTester.java (original)
+++ manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingTester.java Mon Apr 29 16:10:10 2013
@@ -29,7 +29,7 @@ import org.apache.manifoldcf.crawler.con
 import java.io.*;
 import java.util.*;
 
-/** This is a 10000-document crawl with throttling */
+/** This is a repeated 100-document crawl with throttling */
 public class ThrottlingTester
 {
   protected org.apache.manifoldcf.crawler.tests.ManifoldCFInstance instance;
@@ -97,8 +97,7 @@ public class ThrottlingTester
     job.setOutputConnectionName("Null Connection");
     job.setType(job.TYPE_SPECIFIED);
     job.setStartMethod(job.START_DISABLE);
-    job.setHopcountMode(job.HOPCOUNT_ACCURATE);
-    job.addHopCountFilter("link",new Long(2));
+    job.setHopcountMode(job.HOPCOUNT_NEVERDELETE);
 
     // Now, set up the document specification.
     DocumentSpecification ds = job.getSpecification();
@@ -106,7 +105,7 @@ public class ThrottlingTester
     // Set up 100 seeds
     SpecificationNode sn = new SpecificationNode(WebcrawlerConfig.NODE_SEEDS);
     StringBuilder sb = new StringBuilder();
-    for (int i = 0 ; i < 100 ; i++)
+    for (int i = 0 ; i < 10 ; i++)
     {
       sb.append("http://localhost:8191/web/gen.php?site="+i+"&level=0&item=0\n");
     }
@@ -128,21 +127,23 @@ public class ThrottlingTester
     // Save the job.
     jobManager.save(job);
 
-    // Now, start the job, and wait until it completes.
-    long startTime = System.currentTimeMillis();
-    jobManager.manualStart(job.getID());
-    instance.waitJobInactiveNative(jobManager,job.getID(),220000000L);
-    System.err.println("Crawl required "+new Long(System.currentTimeMillis()-startTime).toString()+" milliseconds");
-
-    // Check to be sure we actually processed the right number of documents.
-    JobStatus status = jobManager.getStatus(job.getID());
-    // Four levels deep from 100 site seeds: Each site seed has 1 + 10 + 100 + 1000 = 1111 documents, so 100 has 111100 total, and 11100 processed
-    if (status.getDocumentsProcessed() != 11100)
-      throw new ManifoldCFException("Wrong number of documents processed - expected 111100, saw "+new Long(status.getDocumentsProcessed()).toString());
+    for (int i = 0; i < 100; i++)
+    {
+      System.err.println("Iteration # "+i);
+      // Now, start the job, and wait until it completes.
+      long startTime = System.currentTimeMillis();
+      jobManager.manualStart(job.getID());
+      instance.waitJobInactiveNative(jobManager,job.getID(),300000L);
+      System.err.println(" Crawl required "+new Long(System.currentTimeMillis()-startTime).toString()+" milliseconds");
+
+      // Report the number of documents the job status shows as processed.
+      JobStatus status = jobManager.getStatus(job.getID());
+      System.err.println(" "+new Long(status.getDocumentsProcessed())+" documents processed");
+    }
     
     // Now, delete the job.
     jobManager.deleteJob(job.getID());
-    instance.waitJobDeletedNative(jobManager,job.getID(),18000000L);
+    instance.waitJobDeletedNative(jobManager,job.getID(),300000L);
       
     // Cleanup is automatic by the base class, so we can feel free to leave jobs and connections lying around.
   }