You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2006/09/02 17:44:29 UTC
svn commit: r439610 - in /lucene/nutch/trunk/src: test/
test/org/apache/nutch/crawl/ test/org/apache/nutch/fetcher/ testresources/
testresources/fetch-test-site/
Author: siren
Date: Sat Sep 2 08:44:28 2006
New Revision: 439610
URL: http://svn.apache.org/viewvc?rev=439610&view=rev
Log:
add simple junit test for fetcher
Added:
lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
lucene/nutch/trunk/src/testresources/
lucene/nutch/trunk/src/testresources/fetch-test-site/
lucene/nutch/trunk/src/testresources/fetch-test-site/dup_of_pagea.html
lucene/nutch/trunk/src/testresources/fetch-test-site/index.html
lucene/nutch/trunk/src/testresources/fetch-test-site/pagea.html
lucene/nutch/trunk/src/testresources/fetch-test-site/pageb.html
lucene/nutch/trunk/src/testresources/fetch-test-site/robots.txt
Modified:
lucene/nutch/trunk/src/test/crawl-tests.xml
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
Modified: lucene/nutch/trunk/src/test/crawl-tests.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/crawl-tests.xml?rev=439610&r1=439609&r2=439610&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/crawl-tests.xml (original)
+++ lucene/nutch/trunk/src/test/crawl-tests.xml Sat Sep 2 08:44:28 2006
@@ -6,8 +6,33 @@
<property>
<name>plugin.includes</name>
- <value>urlfilter-suffix|scoring-opic</value>
+ <value>parse-html|protocol-http|urlfilter-suffix|scoring-opic</value>
<description>Enable required plugins.</description>
+</property>
+
+<property>
+ <name>content.server.port</name>
+ <value>55000</value>
+ <description>Port of http server serving content.</description>
+</property>
+
+<property>
+ <name>fetcher.server.delay</name>
+ <value>1.0</value>
+ <description>The number of seconds the fetcher will delay between
+ successive requests to the same server.</description>
+</property>
+
+<property>
+ <name>fetcher.server.delay</name>
+ <value>1.0</value>
+ <description>The number of seconds the fetcher will delay between
+ successive requests to the same server.</description>
+</property>
+
+<property>
+ <name>http.agent.name</name>
+ <value>test-nutch</value>
</property>
</configuration>
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java?rev=439610&r1=439609&r2=439610&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java Sat Sep 2 08:44:28 2006
@@ -15,16 +15,24 @@
*/
package org.apache.nutch.crawl;
+import java.io.File;
+import java.io.IOException;
+import java.net.UnknownHostException;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.UTF8;
+import org.mortbay.http.HttpContext;
+import org.mortbay.http.SocketListener;
+import org.mortbay.http.handler.ResourceHandler;
+import org.mortbay.jetty.Server;
public class CrawlDBTestUtil {
@@ -62,8 +70,20 @@
* set values.
*
* @return
+ * @deprecated Use {@link #createConfiguration()} instead
*/
public static Configuration create(){
+ return createConfiguration();
+ }
+
+ /**
+ * For now we need to manually construct our Configuration, because we need to
+ * override the default one and it is currently not possible to use dynamically
+ * set values.
+ *
+ * @return
+ */
+ public static Configuration createConfiguration(){
Configuration conf=new Configuration();
conf.addDefaultResource("nutch-default.xml");
conf.addFinalResource("crawl-tests.xml");
@@ -80,5 +100,45 @@
this.url = url;
this.datum = datum;
}
+ }
+
+ /**
+  * Writes a seed list to the file <code>urls.txt</code> inside
+  * <code>urlPath</code>, one URL per line. The directory is created before
+  * the file is written.
+  *
+  * @param fs file system the seed list is written to
+  * @param urlPath directory that receives the <code>urls.txt</code> file
+  * @param contents URLs to write, in iteration order
+  * @throws IOException if the directory or the file cannot be created or written
+  */
+ public static void generateSeedList(FileSystem fs, Path urlPath, List<String> contents) throws IOException{
+   Path file=new Path(urlPath,"urls.txt");
+   fs.mkdirs(urlPath);
+   FSDataOutputStream out=fs.create(file);
+   try {
+     for(String url : contents){
+       // Encode explicitly: writeBytes() keeps only the low eight bits of each
+       // char, which corrupts URLs containing non-ASCII characters.
+       out.write(url.getBytes("UTF-8"));
+       out.write('\n');
+     }
+     out.flush();
+   } finally {
+     // Release the stream even when a write fails part-way through.
+     out.close();
+   }
+ }
+
+ /**
+  * Builds a Jetty server that listens on 127.0.0.1 and exposes a single
+  * static context mounted at "/". The server is returned without being
+  * started.
+  *
+  * @param port port the listener is bound to
+  * @param staticContent resource base of the static context
+  * @return the configured server
+  * @throws UnknownHostException propagated from the listener/server setup calls
+  */
+ public static Server getServer(int port, String staticContent) throws UnknownHostException{
+   // Listener bound to the loopback address only.
+   SocketListener loopback = new SocketListener();
+   loopback.setPort(port);
+   loopback.setHost("127.0.0.1");
+
+   // Root context serving files below staticContent.
+   HttpContext rootContext = new HttpContext();
+   rootContext.setContextPath("/");
+   rootContext.setResourceBase(staticContent);
+   rootContext.addHandler(new ResourceHandler());
+
+   Server jetty = new Server();
+   jetty.addListener(loopback);
+   jetty.addContext(rootContext);
+   return jetty;
}
}
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=439610&r1=439609&r2=439610&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Sat Sep 2 08:44:28 2006
@@ -50,7 +50,7 @@
FileSystem fs;
protected void setUp() throws Exception {
- conf = CrawlDBTestUtil.create();
+ conf = CrawlDBTestUtil.createConfiguration();
}
protected void tearDown() {
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java?rev=439610&r1=439609&r2=439610&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java Sat Sep 2 08:44:28 2006
@@ -18,11 +18,9 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
-import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
@@ -42,7 +40,6 @@
*/
public class TestInjector extends TestCase {
- private FSDataOutputStream out;
private Configuration conf;
private FileSystem fs;
final static Path testdir=new Path("build/test/inject-test");
@@ -50,7 +47,7 @@
Path urlPath;
protected void setUp() throws Exception {
- conf = CrawlDBTestUtil.create();
+ conf = CrawlDBTestUtil.createConfiguration();
urlPath=new Path(testdir,"urls");
crawldbPath=new Path(testdir,"crawldb");
fs=FileSystem.get(conf);
@@ -66,7 +63,7 @@
for(int i=0;i<100;i++) {
urls.add("http://zzz/" + i + ".html");
}
- generateSeedList(urls);
+ CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
Injector injector=new Injector(conf);
injector.inject(crawldbPath, urlPath);
@@ -87,7 +84,7 @@
for(int i=0;i<100;i++) {
urls2.add("http://xxx/" + i + ".html");
}
- generateSeedList(urls2);
+ CrawlDBTestUtil.generateSeedList(fs, urlPath, urls2);
injector.inject(crawldbPath, urlPath);
urls.addAll(urls2);
@@ -103,24 +100,6 @@
assertTrue(read.containsAll(urls));
assertTrue(urls.containsAll(read));
- }
-
- /**
- * Generate seedlist
- * @throws IOException
- */
- private void generateSeedList(List<String> contents) throws IOException{
- Path file=new Path(urlPath,"urls.txt");
- fs.mkdirs(urlPath);
- out=fs.create(file);
- Iterator<String> iterator=contents.iterator();
- while(iterator.hasNext()){
- String url=iterator.next();
- out.writeBytes(url);
- out.writeBytes("\n");
- }
- out.flush();
- out.close();
}
private List<String> readCrawldb() throws IOException{
Added: lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java?rev=439610&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Sat Sep 2 08:44:28 2006
@@ -0,0 +1,117 @@
+package org.apache.nutch.fetcher;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.UTF8;
+import org.apache.nutch.crawl.CrawlDBTestUtil;
+import org.apache.nutch.crawl.Generator;
+import org.apache.nutch.crawl.Injector;
+import org.apache.nutch.protocol.Content;
+import org.mortbay.jetty.Server;
+
+import junit.framework.TestCase;
+
+/**
+ * Basic fetcher test
+ * 1. generate seedlist
+ * 2. inject
+ * 3. generate
+ * 4. fetch
+ * 5. Verify contents
+ * @author nutch-dev <nutch-dev at lucene.apache.org>
+ *
+ */
+public class TestFetcher extends TestCase {
+
+ final static Path testdir=new Path("build/test/fetch-test");
+ Configuration conf;
+ FileSystem fs;
+ Path crawldbPath;
+ Path segmentsPath;
+ Path urlPath;
+ Server server;
+
+ /**
+  * Creates the test configuration, clears the test directory and starts the
+  * http server that serves the static test site.
+  */
+ protected void setUp() throws Exception{
+   conf=CrawlDBTestUtil.createConfiguration();
+   fs=FileSystem.get(conf);
+   fs.delete(testdir);
+   urlPath=new Path(testdir,"urls");
+   crawldbPath=new Path(testdir,"crawldb");
+   segmentsPath=new Path(testdir,"segments");
+   server=CrawlDBTestUtil.getServer(conf.getInt("content.server.port",50000), "build/test/data/fetch-test-site");
+   server.start();
+ }
+
+ /**
+  * Stops the http server started in {@link #setUp()}.
+  */
+ protected void tearDown() throws InterruptedException, IOException{
+   // setUp() may have failed before the server was created; avoid masking
+   // that failure with a NullPointerException here.
+   if(server!=null) {
+     server.stop();
+   }
+ }
+
+ /**
+  * Injects the pages of the test site, generates a segment, fetches it and
+  * checks both the politeness delay and the set of fetched urls.
+  */
+ public void testFetch() throws IOException {
+
+   //generate seedlist
+   ArrayList<String> urls=new ArrayList<String>();
+
+   addUrl(urls,"index.html");
+   addUrl(urls,"pagea.html");
+   addUrl(urls,"pageb.html");
+   addUrl(urls,"dup_of_pagea.html");
+
+   CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
+
+   //inject
+   Injector injector=new Injector(conf);
+   injector.inject(crawldbPath, urlPath);
+
+   //generate
+   Generator g=new Generator(conf);
+   Path generatedSegment=g.generate(crawldbPath, segmentsPath, 1, Long.MAX_VALUE, Long.MAX_VALUE);
+
+   long time=System.currentTimeMillis();
+   //fetch
+   Fetcher fetcher=new Fetcher(conf);
+   fetcher.fetch(generatedSegment, 1, true);
+
+   time=System.currentTimeMillis()-time;
+
+   //verify politeness, time taken should be more than (num_of_pages +1)*delay
+   //The elapsed time is in milliseconds while the delay is configured in
+   //(possibly fractional) seconds, so read it as a float and scale it to ms.
+   long minimumTime=(long) ((urls.size()+1)*1000*conf.getFloat("fetcher.server.delay",5));
+   assertTrue(time > minimumTime);
+
+   //verify results
+   Path content=new Path(new Path(generatedSegment, Content.DIR_NAME),"part-00000/data");
+   SequenceFile.Reader reader=new SequenceFile.Reader(fs, content, conf);
+
+   ArrayList<String> handledurls=new ArrayList<String>();
+
+   try {
+     while(true) {
+       //fresh key/value per record so no state leaks between reads
+       UTF8 key=new UTF8();
+       Content value=new Content();
+       if(!reader.next(key, value)) {
+         break;
+       }
+       handledurls.add(key.toString());
+     }
+   } finally {
+     //close the reader even when reading a record fails
+     reader.close();
+   }
+
+   Collections.sort(urls);
+   Collections.sort(handledurls);
+
+   //verify that enough pages were handled
+   assertEquals(urls.size(), handledurls.size());
+
+   //verify that correct pages were handled
+   assertTrue(handledurls.containsAll(urls));
+   assertTrue(urls.containsAll(handledurls));
+
+ }
+
+ /**
+  * Appends the url of the given page, as served by the local test server,
+  * to the list.
+  */
+ private void addUrl(ArrayList<String> urls, String page) {
+   urls.add("http://127.0.0.1:" + server.getListeners()[0].getPort() + "/" + page);
+ }
+
+}
Added: lucene/nutch/trunk/src/testresources/fetch-test-site/dup_of_pagea.html
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/dup_of_pagea.html?rev=439610&view=auto
==============================================================================
--- lucene/nutch/trunk/src/testresources/fetch-test-site/dup_of_pagea.html (added)
+++ lucene/nutch/trunk/src/testresources/fetch-test-site/dup_of_pagea.html Sat Sep 2 08:44:28 2006
@@ -0,0 +1,9 @@
+<html>
+ <head>
+ <title>page a</title>
+ </head>
+<body>
+This is page a
+<a href="index.html">home</a>
+</body>
+</html>
\ No newline at end of file
Added: lucene/nutch/trunk/src/testresources/fetch-test-site/index.html
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/index.html?rev=439610&view=auto
==============================================================================
--- lucene/nutch/trunk/src/testresources/fetch-test-site/index.html (added)
+++ lucene/nutch/trunk/src/testresources/fetch-test-site/index.html Sat Sep 2 08:44:28 2006
@@ -0,0 +1,11 @@
+<html>
+ <head>
+ <title>front page</title>
+ </head>
+<body>
+This is front page.
+<a href="pagea.html">Page a</a>
+<a href="pageb.html">Page b</a>
+<a href="dup_of_pagea.html">dup of Page a</a>
+</body>
+</html>
\ No newline at end of file
Added: lucene/nutch/trunk/src/testresources/fetch-test-site/pagea.html
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/pagea.html?rev=439610&view=auto
==============================================================================
--- lucene/nutch/trunk/src/testresources/fetch-test-site/pagea.html (added)
+++ lucene/nutch/trunk/src/testresources/fetch-test-site/pagea.html Sat Sep 2 08:44:28 2006
@@ -0,0 +1,9 @@
+<html>
+ <head>
+ <title>page a</title>
+ </head>
+<body>
+This is page a
+<a href="index.html">home</a>
+</body>
+</html>
\ No newline at end of file
Added: lucene/nutch/trunk/src/testresources/fetch-test-site/pageb.html
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/pageb.html?rev=439610&view=auto
==============================================================================
--- lucene/nutch/trunk/src/testresources/fetch-test-site/pageb.html (added)
+++ lucene/nutch/trunk/src/testresources/fetch-test-site/pageb.html Sat Sep 2 08:44:28 2006
@@ -0,0 +1,9 @@
+<html>
+ <head>
+ <title>page b</title>
+ </head>
+<body>
+This is page b
+<a href="index.html">home</a>
+</body>
+</html>
\ No newline at end of file
Added: lucene/nutch/trunk/src/testresources/fetch-test-site/robots.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/robots.txt?rev=439610&view=auto
==============================================================================
(empty)