Posted to commits@nutch.apache.org by si...@apache.org on 2006/09/02 14:38:51 UTC
svn commit: r439582 - in /lucene/nutch/trunk/src/test: crawl-tests.xml
org/apache/nutch/crawl/CrawlDBTestUtil.java
org/apache/nutch/crawl/TestGenerator.java
org/apache/nutch/crawl/TestInjector.java
Author: siren
Date: Sat Sep 2 05:38:50 2006
New Revision: 439582
URL: http://svn.apache.org/viewvc?rev=439582&view=rev
Log:
Add simple unit tests for injector and generator
Added:
lucene/nutch/trunk/src/test/crawl-tests.xml
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
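Both new test classes extend junit.framework.TestCase, so they can be driven with the
standard JUnit 3 text runner. A minimal runner sketch (the wrapper class below is
illustrative only and not part of this commit; it assumes Nutch, Hadoop and the test
configuration are on the classpath):

    import junit.framework.TestSuite;
    import junit.textui.TestRunner;
    import org.apache.nutch.crawl.TestGenerator;
    import org.apache.nutch.crawl.TestInjector;

    // Illustrative JUnit 3 runner for the two test classes added in this commit.
    public class RunCrawlTests {
      public static void main(String[] args) {
        TestRunner.run(new TestSuite(TestInjector.class));
        TestRunner.run(new TestSuite(TestGenerator.class));
      }
    }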
Added: lucene/nutch/trunk/src/test/crawl-tests.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/crawl-tests.xml?rev=439582&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/crawl-tests.xml (added)
+++ lucene/nutch/trunk/src/test/crawl-tests.xml Sat Sep 2 05:38:50 2006
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+
+<!-- Configuration overrides used during unit tests. -->
+
+<configuration>
+
+<property>
+ <name>plugin.includes</name>
+ <value>urlfilter-suffix|scoring-opic</value>
+ <description>Enable required plugins.</description>
+</property>
+
+</configuration>
\ No newline at end of file
Added: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java?rev=439582&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java Sat Sep 2 05:38:50 2006
@@ -0,0 +1,69 @@
+package org.apache.nutch.crawl;
+
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.UTF8;
+
+public class CrawlDBTestUtil {
+
+ private static final Log LOG = LogFactory.getLog(CrawlDBTestUtil.class);
+
+ /**
+ * Creates a synthetic crawldb
+ *
+ * @param fs
+ * filesystem where db will be created
+ * @param crawldb
+ * path where the db will be created
+ * @param init
+ * urls to be inserted, objects are of type URLCrawlDatum
+ * @throws Exception
+ */
+ public static void createCrawlDb(FileSystem fs, Path crawldb, List<URLCrawlDatum> init)
+ throws Exception {
+ LOG.trace("* creating crawldb: " + crawldb);
+ Path dir = new Path(crawldb, CrawlDatum.DB_DIR_NAME);
+ MapFile.Writer writer = new MapFile.Writer(fs, new Path(dir, "part-00000")
+ .toString(), UTF8.class, CrawlDatum.class);
+ Iterator<URLCrawlDatum> it = init.iterator();
+ while (it.hasNext()) {
+ URLCrawlDatum row = it.next();
+ LOG.info("adding:" + row.url.toString());
+ writer.append(new UTF8(row.url), row.datum);
+ }
+ writer.close();
+ }
+
+ /**
+ * For now we construct the Configuration manually, because we need to
+ * override the default one and it is currently not possible to set values
+ * dynamically.
+ *
+ * @return a Configuration with the unit-test overrides applied
+ */
+ public static Configuration create(){
+ Configuration conf=new Configuration();
+ conf.addDefaultResource("nutch-default.xml");
+ conf.addFinalResource("crawl-tests.xml");
+ return conf;
+ }
+
+ public static class URLCrawlDatum {
+
+ UTF8 url;
+
+ CrawlDatum datum;
+
+ public URLCrawlDatum(UTF8 url, CrawlDatum datum) {
+ this.url = url;
+ this.datum = datum;
+ }
+ }
+}
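A short usage sketch for the helper above (hypothetical snippet, not part of this
commit; it uses only the constructors and constants that appear in this change set):

    import java.util.ArrayList;
    import java.util.List;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.UTF8;
    import org.apache.nutch.crawl.CrawlDBTestUtil;
    import org.apache.nutch.crawl.CrawlDatum;

    // Hypothetical example: build a one-entry synthetic crawldb using the test
    // configuration and helper added in this commit.
    public class CrawlDbExample {
      public static void main(String[] args) throws Exception {
        Configuration conf = CrawlDBTestUtil.create();
        FileSystem fs = FileSystem.get(conf);
        Path crawldb = new Path("build/test/example-crawldb");

        List<CrawlDBTestUtil.URLCrawlDatum> seed =
            new ArrayList<CrawlDBTestUtil.URLCrawlDatum>();
        seed.add(new CrawlDBTestUtil.URLCrawlDatum(
            new UTF8("http://example.org/"),
            new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 1, 1.0f)));

        CrawlDBTestUtil.createCrawlDb(fs, crawldb, seed);
      }
    }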
Added: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=439582&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Sat Sep 2 05:38:50 2006
@@ -0,0 +1,147 @@
+/*
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.UTF8;
+import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
+
+import junit.framework.TestCase;
+
+/**
+ * Basic generator test:
+ * 1. Inserts entries into the crawldb
+ * 2. Generates entries to fetch
+ * 3. Verifies that the number of generated urls matches
+ * 4. Verifies that the highest scoring urls are generated
+ *
+ * @author nutch-dev <nutch-dev at lucene.apache.org>
+ *
+ */
+public class TestGenerator extends TestCase {
+
+ Configuration conf;
+
+ Path dbDir;
+
+ Path segmentsDir;
+
+ FileSystem fs;
+
+ protected void setUp() throws Exception {
+ conf = CrawlDBTestUtil.create();
+ }
+
+ protected void tearDown() {
+ delete(dbDir);
+ delete(segmentsDir);
+ }
+
+ private void delete(Path p) {
+ try {
+ fs.delete(p);
+ } catch (IOException e) {
+ }
+ }
+
+ /**
+ * Tests that the generator generates a fetchlist ordered by score (descending)
+ *
+ * @throws Exception
+ */
+ public void testGenerateHighest() throws Exception {
+
+ int NUM_RESULTS=2;
+
+ ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+ for(int i=0;i<=100;i++){
+ list.add(new CrawlDBTestUtil.URLCrawlDatum(new UTF8("http://aaa/" + pad(i)),
+ new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 1, i)));
+ }
+
+ fs = FileSystem.get(conf);
+ dbDir = new Path("test-crawldb-" + new java.util.Random().nextInt());
+ segmentsDir = new Path("test-crawldb-segments" + new java.util.Random().nextInt());
+ fs.mkdirs(dbDir);
+ fs.mkdirs(segmentsDir);
+
+ // create crawldb
+ CrawlDBTestUtil.createCrawlDb(fs, dbDir, list);
+
+ // generate segment
+ Generator g=new Generator(conf);
+ Path generatedSegment=g.generate(dbDir, segmentsDir,0,NUM_RESULTS, Long.MAX_VALUE);
+
+ Path fetchlist=new Path(new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME),"part-00000");
+
+ // verify results
+ SequenceFile.Reader reader=new SequenceFile.Reader(fs, fetchlist, conf);
+
+ ArrayList<URLCrawlDatum> l=new ArrayList<URLCrawlDatum>();
+
+ READ:
+ do {
+ UTF8 key=new UTF8();
+ CrawlDatum value=new CrawlDatum();
+ if(!reader.next(key, value)) break READ;
+ l.add(new URLCrawlDatum(key, value));
+ } while(true);
+
+ reader.close();
+
+ // sort urls by score desc
+ Collections.sort(l, new ScoreComparator());
+
+ // verify we got the right number of records
+ assertEquals(NUM_RESULTS, l.size());
+
+ // verify we have the highest scoring urls
+ assertEquals("http://aaa/100", (l.get(0).url.toString()));
+ assertEquals("http://aaa/099", (l.get(1).url.toString()));
+ }
+
+ private String pad(int i) {
+ String s=Integer.toString(i);
+ while(s.length()<3)
+ s="0" + s;
+ return s;
+ }
+
+ /**
+ * Comparator that sorts by score desc
+ */
+ public class ScoreComparator implements Comparator<URLCrawlDatum> {
+
+ public int compare(URLCrawlDatum tuple1, URLCrawlDatum tuple2) {
+
+ if (tuple2.datum.getScore() - tuple1.datum.getScore() < 0)
+ return -1;
+ if (tuple2.datum.getScore() - tuple1.datum.getScore() > 0)
+ return 1;
+
+ return 0;
+ }
+ }
+}
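A side note on ScoreComparator above: the subtraction-based comparison works for the
scores used in this test, but an equivalent and arguably clearer form of the same
descending order (a sketch, assuming CrawlDatum.getScore() returns a float as used
above) would be:

    // Same ordering as ScoreComparator: higher scores sort first.
    public int compare(URLCrawlDatum tuple1, URLCrawlDatum tuple2) {
      return Float.compare(tuple2.datum.getScore(), tuple1.datum.getScore());
    }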
Added: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java?rev=439582&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java Sat Sep 2 05:38:50 2006
@@ -0,0 +1,128 @@
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.UTF8;
+
+import junit.framework.TestCase;
+
+/**
+ * Basic injector test:
+ * 1. Creates a text file with urls
+ * 2. Injects them into crawldb
+ * 3. Reads crawldb entries and verifies contents
+ * 4. Injects more urls into the crawldb
+ * 5. Reads crawldb entries and verifies contents
+ *
+ * @author nutch-dev <nutch-dev at lucene.apache.org>
+ */
+public class TestInjector extends TestCase {
+
+ private FSDataOutputStream out;
+ private Configuration conf;
+ private FileSystem fs;
+ final static Path testdir=new Path("build/test/inject-test");
+ Path crawldbPath;
+ Path urlPath;
+
+ protected void setUp() throws Exception {
+ conf = CrawlDBTestUtil.create();
+ urlPath=new Path(testdir,"urls");
+ crawldbPath=new Path(testdir,"crawldb");
+ fs=FileSystem.get(conf);
+
+ }
+
+ protected void tearDown() throws IOException{
+ fs.delete(testdir);
+ }
+
+ public void testInject() throws IOException {
+ ArrayList<String> urls=new ArrayList<String>();
+ for(int i=0;i<100;i++) {
+ urls.add("http://zzz/" + i + ".html");
+ }
+ generateSeedList(urls);
+
+ Injector injector=new Injector(conf);
+ injector.inject(crawldbPath, urlPath);
+
+ // verify results
+ List<String>read=readCrawldb();
+
+ Collections.sort(read);
+ Collections.sort(urls);
+
+ assertEquals(urls.size(), read.size());
+
+ assertTrue(read.containsAll(urls));
+ assertTrue(urls.containsAll(read));
+
+ //inject more urls
+ ArrayList<String> urls2=new ArrayList<String>();
+ for(int i=0;i<100;i++) {
+ urls2.add("http://xxx/" + i + ".html");
+ }
+ generateSeedList(urls2);
+ injector.inject(crawldbPath, urlPath);
+ urls.addAll(urls2);
+
+ // verify results
+ read=readCrawldb();
+
+
+ Collections.sort(read);
+ Collections.sort(urls);
+
+ assertEquals(urls.size(), read.size());
+
+ assertTrue(read.containsAll(urls));
+ assertTrue(urls.containsAll(read));
+
+ }
+
+ /**
+ * Generates a seed list file containing the given urls, one per line.
+ * @throws IOException
+ */
+ private void generateSeedList(List<String> contents) throws IOException{
+ Path file=new Path(urlPath,"urls.txt");
+ fs.mkdirs(urlPath);
+ out=fs.create(file);
+ Iterator<String> iterator=contents.iterator();
+ while(iterator.hasNext()){
+ String url=iterator.next();
+ out.writeBytes(url);
+ out.writeBytes("\n");
+ }
+ out.flush();
+ out.close();
+ }
+
+ private List<String> readCrawldb() throws IOException{
+ Path dbfile=new Path(crawldbPath,CrawlDatum.DB_DIR_NAME + "/part-00000/data");
+ System.out.println("reading:" + dbfile);
+ SequenceFile.Reader reader=new SequenceFile.Reader(fs, dbfile, conf);
+ ArrayList<String> read=new ArrayList<String>();
+
+ READ:
+ do {
+ UTF8 key=new UTF8();
+ CrawlDatum value=new CrawlDatum();
+ if(!reader.next(key, value)) break READ;
+ read.add(key.toString());
+ } while(true);
+
+ return read;
+ }
+
+}
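A design note on readCrawldb() above: it reads the data file of the crawldb part
directly as a SequenceFile. Since the part is written as a MapFile (see
CrawlDBTestUtil.createCrawlDb), the same entries could also be read back through
MapFile.Reader. A hypothetical alternative, not part of this commit (it would
additionally need an import of org.apache.hadoop.io.MapFile):

    // Hypothetical alternative to readCrawldb(): open the crawldb part as a
    // MapFile rather than reading its data file as a raw SequenceFile.
    private List<String> readCrawldbViaMapFile() throws IOException {
      Path part = new Path(crawldbPath, CrawlDatum.DB_DIR_NAME + "/part-00000");
      MapFile.Reader reader = new MapFile.Reader(fs, part.toString(), conf);
      ArrayList<String> read = new ArrayList<String>();
      UTF8 key = new UTF8();
      CrawlDatum value = new CrawlDatum();
      while (reader.next(key, value)) {
        read.add(key.toString());
      }
      reader.close();
      return read;
    }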