You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/04/24 20:12:38 UTC

svn commit: r1471573 - in /manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests: ThrottlingPostgresqlLT.java ThrottlingTester.java

Author: kwright
Date: Wed Apr 24 18:12:37 2013
New Revision: 1471573

URL: http://svn.apache.org/r1471573
Log:
Add a test for throttling that will hopefully reassure us there is no problem with this functionality.  Part of CONNECTORS-679.

Added:
    manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingPostgresqlLT.java   (with props)
    manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingTester.java   (with props)

Added: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingPostgresqlLT.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingPostgresqlLT.java?rev=1471573&view=auto
==============================================================================
--- manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingPostgresqlLT.java (added)
+++ manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingPostgresqlLT.java Wed Apr 24 18:12:37 2013
@@ -0,0 +1,61 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.webcrawler_tests;
+
+import java.io.*;
+import java.util.*;
+import org.junit.*;
+
+/** JUnit wrapper for the throttled web crawl: each test starts a MockWebService, runs ThrottlingTester.executeTest(), then stops the service. */
+public class ThrottlingPostgresqlLT extends BasePostgresql
+{
+  // The crawl scenario to run, and the mock web service started around each test.
+  protected ThrottlingTester tester;
+  protected MockWebService webService = null;
+  
+  public ThrottlingPostgresqlLT()
+  {
+    tester = new ThrottlingTester(mcfInstance);
+  }
+  
+  // Set up and tear down the mock web service around each test
+  
+  @Before
+  public void createWebService()
+    throws Exception
+  {
+    webService = new MockWebService(10);
+    webService.start();
+  }
+  
+  @After
+  public void shutdownWebService()
+    throws Exception
+  {
+    if (webService != null)
+      webService.stop();
+  }
+
+  @Test
+  public void bigCrawl()
+    throws Exception
+  {
+    tester.executeTest();
+  }
+}

Propchange: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingPostgresqlLT.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingPostgresqlLT.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingTester.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingTester.java?rev=1471573&view=auto
==============================================================================
--- manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingTester.java (added)
+++ manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingTester.java Wed Apr 24 18:12:37 2013
@@ -0,0 +1,159 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.webcrawler_tests;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.agents.interfaces.*;
+import org.apache.manifoldcf.crawler.interfaces.*;
+import org.apache.manifoldcf.crawler.system.ManifoldCF;
+
+import org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector;
+import org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig;
+
+import java.io.*;
+import java.util.*;
+
+/** Runs a throttled web crawl from 100 seed URLs and checks that exactly 111100 documents are processed. */
+public class ThrottlingTester
+{
+  protected org.apache.manifoldcf.crawler.tests.ManifoldCFInstance instance;
+  
+  public ThrottlingTester(org.apache.manifoldcf.crawler.tests.ManifoldCFInstance instance)
+  {
+    this.instance = instance;
+  }
+  /** Creates the connections and the job, runs the crawl, checks the processed-document count, then deletes the job. */
+  public void executeTest()
+    throws Exception
+  {
+    // Hey, we were able to install the connector etc.
+    // Now, create a local test job and run it.
+    IThreadContext tc = ThreadContextFactory.make();
+      
+    // Create a basic web crawler repository connection, and save it.
+    IRepositoryConnectionManager mgr = RepositoryConnectionManagerFactory.make(tc);
+    IRepositoryConnection conn = mgr.create();
+    conn.setName("Web Connection");
+    conn.setDescription("Web Connection");
+    conn.setClassName("org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector");
+    conn.setMaxConnections(100);
+    ConfigParams cp = conn.getConfigParams();
+    
+    cp.setParameter(WebcrawlerConfig.PARAMETER_EMAIL,"someone@somewhere.com");
+    cp.setParameter(WebcrawlerConfig.PARAMETER_ROBOTSUSAGE,"none");
+    
+    // Throttling: one bin description with an empty bin regexp, carrying max connections 10, max KB per second 16, max fetches per minute 12
+    ConfigurationNode cn = new ConfigurationNode(WebcrawlerConfig.NODE_BINDESC);
+    cn.setAttribute(WebcrawlerConfig.ATTR_BINREGEXP,"");
+    
+    ConfigurationNode con = new ConfigurationNode(WebcrawlerConfig.NODE_MAXCONNECTIONS);
+    con.setAttribute(WebcrawlerConfig.ATTR_VALUE,"10");
+    cn.addChild(cn.getChildCount(),con);
+    
+    ConfigurationNode maxKB = new ConfigurationNode(WebcrawlerConfig.NODE_MAXKBPERSECOND);
+    maxKB.setAttribute(WebcrawlerConfig.ATTR_VALUE,"16");
+    cn.addChild(cn.getChildCount(),maxKB);
+    
+    ConfigurationNode maxFetches = new ConfigurationNode(WebcrawlerConfig.NODE_MAXFETCHESPERMINUTE);
+    maxFetches.setAttribute(WebcrawlerConfig.ATTR_VALUE,"12");
+    cn.addChild(cn.getChildCount(),maxFetches);
+    
+    cp.addChild(cp.getChildCount(),cn);
+    
+    // Now, save
+    mgr.save(conn);
+      
+    // Create a basic null output connection, and save it.
+    IOutputConnectionManager outputMgr = OutputConnectionManagerFactory.make(tc);
+    IOutputConnection outputConn = outputMgr.create();
+    outputConn.setName("Null Connection");
+    outputConn.setDescription("Null Connection");
+    outputConn.setClassName("org.apache.manifoldcf.agents.output.nullconnector.NullConnector");
+    outputConn.setMaxConnections(100);
+    // Now, save
+    outputMgr.save(outputConn);
+
+    // Create a job.
+    IJobManager jobManager = JobManagerFactory.make(tc);
+    IJobDescription job = jobManager.createJob();
+    job.setDescription("Test Job");
+    job.setConnectionName("Web Connection");
+    job.setOutputConnectionName("Null Connection");
+    job.setType(job.TYPE_SPECIFIED);
+    job.setStartMethod(job.START_DISABLE);
+    job.setHopcountMode(job.HOPCOUNT_ACCURATE);
+    job.addHopCountFilter("link",new Long(2));
+    // NOTE(review): confirm that a "link" hop-count limit of 2 still reaches all four levels assumed by the expected count below.
+    // Now, set up the document specification.
+    DocumentSpecification ds = job.getSpecification();
+    
+    // Set up 100 seeds, one URL per site number 0..99, all at level 0 / item 0 on localhost:8191
+    SpecificationNode sn = new SpecificationNode(WebcrawlerConfig.NODE_SEEDS);
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0 ; i < 100 ; i++)
+    {
+      sb.append("http://localhost:8191/web/gen.php?site="+i+"&level=0&item=0\n");
+    }
+    sn.setValue(sb.toString());
+    ds.addChild(ds.getChildCount(),sn);
+    
+    sn = new SpecificationNode(WebcrawlerConfig.NODE_INCLUDES);
+    sn.setValue(".*\n");
+    ds.addChild(ds.getChildCount(),sn);
+    
+    sn = new SpecificationNode(WebcrawlerConfig.NODE_INCLUDESINDEX);
+    sn.setValue(".*\n");
+    ds.addChild(ds.getChildCount(),sn);
+
+    // Set up the output specification.
+    OutputSpecification os = job.getOutputSpecification();
+    // Null output connections have no output specification, so this is a no-op; os is not used again.
+    
+    // Save the job.
+    jobManager.save(job);
+
+    // Now, start the job, and wait until it completes (220000000L is presumably a timeout in milliseconds - confirm).
+    long startTime = System.currentTimeMillis();
+    jobManager.manualStart(job.getID());
+    instance.waitJobInactiveNative(jobManager,job.getID(),220000000L);
+    System.err.println("Crawl required "+new Long(System.currentTimeMillis()-startTime).toString()+" milliseconds");
+
+    // Check the document count. On a mismatch the loop below sleeps forever (1 < 0 is never true), so the throw after it is never reached.
+    JobStatus status = jobManager.getStatus(job.getID());
+    // Four levels deep from 100 site seeds: each site seed has 1 + 10 + 100 + 1000 = 1111 documents, so 100 seeds give 111100
+    if (status.getDocumentsProcessed() != 111100)
+    {
+      System.err.println("Sleeping for database inspection");
+      while (true)
+      {
+        if (1 < 0)
+          break;
+        Thread.sleep(10000L);
+      }
+      throw new ManifoldCFException("Wrong number of documents processed - expected 111100, saw "+new Long(status.getDocumentsProcessed()).toString());
+    }
+    
+    // Now, delete the job.
+    jobManager.deleteJob(job.getID());
+    instance.waitJobDeletedNative(jobManager,job.getID(),18000000L);
+      
+    // Cleanup is automatic by the base class, so we can feel free to leave jobs and connections lying around.
+  }
+  
+}

Propchange: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingTester.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/ThrottlingTester.java
------------------------------------------------------------------------------
    svn:keywords = Id