You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2006/09/02 17:44:29 UTC

svn commit: r439610 - in /lucene/nutch/trunk/src: test/ test/org/apache/nutch/crawl/ test/org/apache/nutch/fetcher/ testresources/ testresources/fetch-test-site/

Author: siren
Date: Sat Sep  2 08:44:28 2006
New Revision: 439610

URL: http://svn.apache.org/viewvc?rev=439610&view=rev
Log:
add simple junit test for fetcher

Added:
    lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
    lucene/nutch/trunk/src/testresources/
    lucene/nutch/trunk/src/testresources/fetch-test-site/
    lucene/nutch/trunk/src/testresources/fetch-test-site/dup_of_pagea.html
    lucene/nutch/trunk/src/testresources/fetch-test-site/index.html
    lucene/nutch/trunk/src/testresources/fetch-test-site/pagea.html
    lucene/nutch/trunk/src/testresources/fetch-test-site/pageb.html
    lucene/nutch/trunk/src/testresources/fetch-test-site/robots.txt
Modified:
    lucene/nutch/trunk/src/test/crawl-tests.xml
    lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
    lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
    lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java

Modified: lucene/nutch/trunk/src/test/crawl-tests.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/crawl-tests.xml?rev=439610&r1=439609&r2=439610&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/crawl-tests.xml (original)
+++ lucene/nutch/trunk/src/test/crawl-tests.xml Sat Sep  2 08:44:28 2006
@@ -6,8 +6,26 @@
 
 <property>
   <name>plugin.includes</name>
-  <value>urlfilter-suffix|scoring-opic</value>
+  <value>parse-html|protocol-http|urlfilter-suffix|scoring-opic</value>
   <description>Enable required plugins.</description>
+</property>
+
+<property>
+  <name>content.server.port</name>
+  <value>55000</value>
+  <description>Port of http server serving content.</description>
+</property>
+
+<property>
+  <name>fetcher.server.delay</name>
+  <value>1.0</value>
+  <description>The number of seconds the fetcher will delay between 
+   successive requests to the same server.</description>
+</property>
+
+<property>
+  <name>http.agent.name</name>
+  <value>test-nutch</value>
 </property>
 
 </configuration>

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java?rev=439610&r1=439609&r2=439610&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java Sat Sep  2 08:44:28 2006
@@ -15,16 +15,24 @@
  */
 package org.apache.nutch.crawl;
 
+import java.io.File;
+import java.io.IOException;
+import java.net.UnknownHostException;
 import java.util.Iterator;
 import java.util.List;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.MapFile;
 import org.apache.hadoop.io.UTF8;
+import org.mortbay.http.HttpContext;
+import org.mortbay.http.SocketListener;
+import org.mortbay.http.handler.ResourceHandler;
+import org.mortbay.jetty.Server;
 
 public class CrawlDBTestUtil {
 
@@ -62,8 +70,20 @@
    * set values.
    * 
    * @return
+   * @deprecated Use {@link #createConfiguration()} instead
    */
   public static Configuration create(){
+    return createConfiguration();
+  }
+
+  /**
+   * For now we need to manually construct our Configuration, because we need to
+   * override the default one and it is currently not possible to use dynamically
+   * set values.
+   * 
+   * @return
+   */
+  public static Configuration createConfiguration(){
     Configuration conf=new Configuration();
     conf.addDefaultResource("nutch-default.xml");
     conf.addFinalResource("crawl-tests.xml");
@@ -80,5 +100,45 @@
       this.url = url;
       this.datum = datum;
     }
+  }
+  
+  /**
+   * Writes the given urls, one per line, to urls.txt below urlPath.
+   * @throws IOException if the seed file cannot be created or written
+   */
+  public static void generateSeedList(FileSystem fs, Path urlPath, List<String> contents) throws IOException{
+    fs.mkdirs(urlPath);
+    FSDataOutputStream out=fs.create(new Path(urlPath,"urls.txt"));
+    try {
+      for(String url : contents){
+        out.writeBytes(url);
+        out.writeBytes("\n");
+      }
+      out.flush();
+    } finally {
+      //close the stream even when a write fails so the file handle is not leaked
+      out.close();
+    }
+  }
+  
+  /**
+   * Builds a Jetty server on 127.0.0.1 with one static root context; it is not started here.
+   * @param port port to listen to
+   * @param staticContent folder where static content lives
+   * @return the configured server
+   * @throws UnknownHostException 
+   */
+  public static Server getServer(int port, String staticContent) throws UnknownHostException{
+    SocketListener socketListener = new SocketListener();
+    socketListener.setPort(port);
+    socketListener.setHost("127.0.0.1");
+    HttpContext rootContext = new HttpContext();
+    rootContext.setContextPath("/");
+    rootContext.setResourceBase(staticContent);
+    rootContext.addHandler(new ResourceHandler());
+    Server jetty = new Server();
+    jetty.addListener(socketListener);
+    jetty.addContext(rootContext);
+    return jetty;
   }
 }

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=439610&r1=439609&r2=439610&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Sat Sep  2 08:44:28 2006
@@ -50,7 +50,7 @@
   FileSystem fs;
 
   protected void setUp() throws Exception {
-    conf = CrawlDBTestUtil.create();
+    conf = CrawlDBTestUtil.createConfiguration();
   }
 
   protected void tearDown() {

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java?rev=439610&r1=439609&r2=439610&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java Sat Sep  2 08:44:28 2006
@@ -18,11 +18,9 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
-import java.util.Iterator;
 import java.util.List;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
@@ -42,7 +40,6 @@
  */
 public class TestInjector extends TestCase {
 
-  private FSDataOutputStream out;
   private Configuration conf;
   private FileSystem fs;
   final static Path testdir=new Path("build/test/inject-test");
@@ -50,7 +47,7 @@
   Path urlPath;
   
   protected void setUp() throws Exception {
-    conf = CrawlDBTestUtil.create();
+    conf = CrawlDBTestUtil.createConfiguration();
     urlPath=new Path(testdir,"urls");
     crawldbPath=new Path(testdir,"crawldb");
     fs=FileSystem.get(conf);
@@ -66,7 +63,7 @@
     for(int i=0;i<100;i++) {
       urls.add("http://zzz/" + i + ".html");
     }
-    generateSeedList(urls);
+    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
     
     Injector injector=new Injector(conf);
     injector.inject(crawldbPath, urlPath);
@@ -87,7 +84,7 @@
     for(int i=0;i<100;i++) {
       urls2.add("http://xxx/" + i + ".html");
     }
-    generateSeedList(urls2);
+    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls2);
     injector.inject(crawldbPath, urlPath);
     urls.addAll(urls2);
     
@@ -103,24 +100,6 @@
     assertTrue(read.containsAll(urls));
     assertTrue(urls.containsAll(read));
     
-  }
-  
-  /**
-   * Generate seedlist
-   * @throws IOException 
-   */
-  private void generateSeedList(List<String> contents) throws IOException{
-    Path file=new Path(urlPath,"urls.txt");
-    fs.mkdirs(urlPath);
-    out=fs.create(file);
-    Iterator<String> iterator=contents.iterator();
-    while(iterator.hasNext()){
-      String url=iterator.next();
-      out.writeBytes(url);
-      out.writeBytes("\n");
-    }
-    out.flush();
-    out.close();
   }
   
   private List<String> readCrawldb() throws IOException{

Added: lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java?rev=439610&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Sat Sep  2 08:44:28 2006
@@ -0,0 +1,136 @@
+package org.apache.nutch.fetcher;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.UTF8;
+import org.apache.nutch.crawl.CrawlDBTestUtil;
+import org.apache.nutch.crawl.Generator;
+import org.apache.nutch.crawl.Injector;
+import org.apache.nutch.protocol.Content;
+import org.mortbay.jetty.Server;
+
+import junit.framework.TestCase;
+
+/**
+ * Basic fetcher test
+ * 1. generate seedlist
+ * 2. inject
+ * 3. generate
+ * 4. fetch
+ * 5. Verify contents
+ * @author nutch-dev <nutch-dev at lucene.apache.org>
+ *
+ */
+public class TestFetcher extends TestCase {
+
+  final static Path testdir=new Path("build/test/fetch-test");
+  Configuration conf;
+  FileSystem fs;
+  Path crawldbPath;
+  Path segmentsPath;
+  Path urlPath;
+  Server server;
+
+  /**
+   * Wipes the test directory and starts the http server that serves the
+   * static test site on the port configured as content.server.port.
+   */
+  protected void setUp() throws Exception{
+    conf=CrawlDBTestUtil.createConfiguration();
+    fs=FileSystem.get(conf);
+    fs.delete(testdir);
+    urlPath=new Path(testdir,"urls");
+    crawldbPath=new Path(testdir,"crawldb");
+    segmentsPath=new Path(testdir,"segments");
+    server=CrawlDBTestUtil.getServer(conf.getInt("content.server.port",50000), "build/test/data/fetch-test-site");
+    server.start();
+  }
+
+  /**
+   * Stops the http server started in setUp.
+   */
+  protected void tearDown() throws InterruptedException, IOException{
+    server.stop();
+  }
+
+  /**
+   * Seeds the crawldb with the pages of the test site, generates and fetches
+   * one segment, then checks that the fetch was polite and that exactly the
+   * seeded urls ended up in the segment content.
+   */
+  public void testFetch() throws IOException {
+
+    //generate seedlist
+    ArrayList<String> urls=new ArrayList<String>();
+
+    addUrl(urls,"index.html");
+    addUrl(urls,"pagea.html");
+    addUrl(urls,"pageb.html");
+    addUrl(urls,"dup_of_pagea.html");
+
+    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
+
+    //inject
+    Injector injector=new Injector(conf);
+    injector.inject(crawldbPath, urlPath);
+
+    //generate
+    Generator g=new Generator(conf);
+    Path generatedSegment=g.generate(crawldbPath, segmentsPath, 1, Long.MAX_VALUE, Long.MAX_VALUE);
+
+    long time=System.currentTimeMillis();
+    //fetch
+    Fetcher fetcher=new Fetcher(conf);
+    fetcher.fetch(generatedSegment, 1, true);
+
+    time=System.currentTimeMillis()-time;
+
+    //verify politeness, time taken should be more than (num_of_pages +1)*delay.
+    //time is in milliseconds while the delay is configured in (possibly
+    //fractional) seconds, so read it as a float and convert it to milliseconds
+    long minimumTime=(long) ((urls.size() + 1) * 1000 * conf.getFloat("fetcher.server.delay",5));
+    assertTrue("fetch took " + time + " ms, expected more than " + minimumTime + " ms", time > minimumTime);
+
+    //verify results
+    Path content=new Path(new Path(generatedSegment, Content.DIR_NAME),"part-00000/data");
+    SequenceFile.Reader reader=new SequenceFile.Reader(fs, content, conf);
+
+    ArrayList<String> handledurls=new ArrayList<String>();
+
+    try {
+      UTF8 key=new UTF8();
+      Content value=new Content();
+      while(reader.next(key, value)) {
+        handledurls.add(key.toString());
+      }
+    } finally {
+      //release the segment file even when reading fails
+      reader.close();
+    }
+
+    Collections.sort(urls);
+    Collections.sort(handledurls);
+
+    //verify that enough pages were handled
+    assertEquals(urls.size(), handledurls.size());
+
+    //verify that correct pages were handled
+    assertTrue(handledurls.containsAll(urls));
+    assertTrue(urls.containsAll(handledurls));
+
+  }
+
+  /**
+   * Adds the url of the given page on the local test server to the list.
+   */
+  private void addUrl(ArrayList<String> urls, String page) {
+    urls.add("http://127.0.0.1:" + server.getListeners()[0].getPort() + "/" + page);
+  }
+
+}

Added: lucene/nutch/trunk/src/testresources/fetch-test-site/dup_of_pagea.html
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/dup_of_pagea.html?rev=439610&view=auto
==============================================================================
--- lucene/nutch/trunk/src/testresources/fetch-test-site/dup_of_pagea.html (added)
+++ lucene/nutch/trunk/src/testresources/fetch-test-site/dup_of_pagea.html Sat Sep  2 08:44:28 2006
@@ -0,0 +1,9 @@
+<html>
+ <head>
+  <title>page a</title>
+ </head>
+<body>
+This is page a
+<a href="index.html">home</a>
+</body>
+</html>
\ No newline at end of file

Added: lucene/nutch/trunk/src/testresources/fetch-test-site/index.html
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/index.html?rev=439610&view=auto
==============================================================================
--- lucene/nutch/trunk/src/testresources/fetch-test-site/index.html (added)
+++ lucene/nutch/trunk/src/testresources/fetch-test-site/index.html Sat Sep  2 08:44:28 2006
@@ -0,0 +1,11 @@
+<html>
+ <head>
+  <title>front page</title>
+ </head>
+<body>
+This is front page.
+<a href="pagea.html">Page a</a>
+<a href="pageb.html">Page b</a>
+<a href="dup_of_pagea.html">dup of Page a</a>
+</body>
+</html>
\ No newline at end of file

Added: lucene/nutch/trunk/src/testresources/fetch-test-site/pagea.html
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/pagea.html?rev=439610&view=auto
==============================================================================
--- lucene/nutch/trunk/src/testresources/fetch-test-site/pagea.html (added)
+++ lucene/nutch/trunk/src/testresources/fetch-test-site/pagea.html Sat Sep  2 08:44:28 2006
@@ -0,0 +1,9 @@
+<html>
+ <head>
+  <title>page a</title>
+ </head>
+<body>
+This is page a
+<a href="index.html">home</a>
+</body>
+</html>
\ No newline at end of file

Added: lucene/nutch/trunk/src/testresources/fetch-test-site/pageb.html
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/pageb.html?rev=439610&view=auto
==============================================================================
--- lucene/nutch/trunk/src/testresources/fetch-test-site/pageb.html (added)
+++ lucene/nutch/trunk/src/testresources/fetch-test-site/pageb.html Sat Sep  2 08:44:28 2006
@@ -0,0 +1,9 @@
+<html>
+ <head>
+  <title>page b</title>
+ </head>
+<body>
+This is page b
+<a href="index.html">home</a>
+</body>
+</html>
\ No newline at end of file

Added: lucene/nutch/trunk/src/testresources/fetch-test-site/robots.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/robots.txt?rev=439610&view=auto
==============================================================================
    (empty)