You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2006/11/15 20:42:22 UTC
svn commit: r475378 - in /lucene/nutch/trunk/src:
java/org/apache/nutch/crawl/Generator.java
test/org/apache/nutch/crawl/TestGenerator.java
Author: siren
Date: Wed Nov 15 11:42:22 2006
New Revision: 475378
URL: http://svn.apache.org/viewvc?view=rev&rev=475378
Log:
added more junit tests
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=475378&r1=475377&r2=475378
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Wed Nov 15 11:42:22 2006
@@ -44,6 +44,10 @@
/** Generates a subset of a crawl db to fetch. */
public class Generator extends ToolBase {
+ public static final String GENERATE_MAX_PER_HOST_BY_IP = "generate.max.per.host.by.ip";
+ public static final String GENERATE_MAX_PER_HOST = "generate.max.per.host";
+ public static final String CRAWL_TOP_N = "crawl.topN";
+ public static final String CRAWL_GEN_CUR_TIME = "crawl.gen.curTime";
public static final Log LOG = LogFactory.getLog(Generator.class);
public static class SelectorEntry implements Writable {
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?view=diff&rev=475378&r1=475377&r2=475378
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Wed Nov 15 11:42:22 2006
@@ -31,12 +31,10 @@
import junit.framework.TestCase;
/**
- * Basic generator test:
- * 1. Insert entries in crawldb
- * 2. Generates entries to fetch
- * 3. Verifies that number of generated urls match
- * 4. Verifies that highest scoring urls are generated
-
+ * Basic generator test. 1. Insert entries in crawldb 2. Generates entries to
+ * fetch 3. Verifies that number of generated urls match 4. Verifies that
+ * highest scoring urls are generated
+ *
* @author nutch-dev <nutch-dev at lucene.apache.org>
*
*/
@@ -50,11 +48,11 @@
FileSystem fs;
- final static Path testdir=new Path("build/test/generator-test");
+ final static Path testdir = new Path("build/test/generator-test");
protected void setUp() throws Exception {
conf = CrawlDBTestUtil.createConfiguration();
- fs=FileSystem.get(conf);
+ fs = FileSystem.get(conf);
fs.delete(testdir);
}
@@ -70,81 +68,243 @@
}
/**
- * Test that generator generates fetchlish ordered by score (desc)
- *
+ * Test that generator generates a fetchlist ordered by score (desc).
+ *
* @throws Exception
*/
public void testGenerateHighest() throws Exception {
- int NUM_RESULTS=2;
-
+ final int NUM_RESULTS = 2;
+
ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
-
- for(int i=0;i<=100;i++){
- list.add(new CrawlDBTestUtil.URLCrawlDatum(new Text("http://aaa/" + pad(i)),
- new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 1, i)));
+
+ for (int i = 0; i <= 100; i++) {
+ list.add(createURLCrawlDatum("http://aaa/" + pad(i),
+ 1, i));
}
-
- dbDir = new Path(testdir, "crawldb");
- segmentsDir = new Path(testdir, "segments");
- fs.mkdirs(dbDir);
- fs.mkdirs(segmentsDir);
-
- // create crawldb
- CrawlDBTestUtil.createCrawlDb(fs, dbDir, list);
-
- // generate segment
- Generator g=new Generator(conf);
- Path generatedSegment=g.generate(dbDir, segmentsDir, -1, NUM_RESULTS, Long.MAX_VALUE);
-
- Path fetchlist=new Path(new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME), "part-00000");
-
- // verify results
- SequenceFile.Reader reader=new SequenceFile.Reader(fs, fetchlist, conf);
-
- ArrayList<URLCrawlDatum> l=new ArrayList<URLCrawlDatum>();
-
- READ:
- do {
- Text key=new Text();
- CrawlDatum value=new CrawlDatum();
- if(!reader.next(key, value)) break READ;
- l.add(new URLCrawlDatum(key, value));
- } while(true);
- reader.close();
+ createCrawlDB(list);
+ Path generatedSegment = generateFetchlist(NUM_RESULTS, conf);
+
+ Path fetchlist = new Path(new Path(generatedSegment,
+ CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+ ArrayList<URLCrawlDatum> l = readContents(fetchlist);
+
// sort urls by score desc
Collections.sort(l, new ScoreComparator());
- //verify we got right amount of records
+ // verify we got right amount of records
assertEquals(NUM_RESULTS, l.size());
- //verify we have the highest scoring urls
+ // verify we have the highest scoring urls
assertEquals("http://aaa/100", (l.get(0).url.toString()));
assertEquals("http://aaa/099", (l.get(1).url.toString()));
}
private String pad(int i) {
- String s=Integer.toString(i);
- while(s.length()<3)
- s="0" + s;
+ String s = Integer.toString(i);
+ while (s.length() < 3) {
+ s = "0" + s;
+ }
return s;
}
/**
- * Comparator that sorts by score desc
+ * Comparator that sorts by score desc.
*/
public class ScoreComparator implements Comparator<URLCrawlDatum> {
public int compare(URLCrawlDatum tuple1, URLCrawlDatum tuple2) {
-
- if (tuple2.datum.getScore() - tuple1.datum.getScore() < 0)
+ if (tuple2.datum.getScore() - tuple1.datum.getScore() < 0) {
return -1;
- if (tuple2.datum.getScore() - tuple1.datum.getScore() > 0)
+ }
+ if (tuple2.datum.getScore() - tuple1.datum.getScore() > 0) {
return 1;
-
+ }
return 0;
}
+ }
+
+ /**
+ * Test that generator obeys the property "generate.max.per.host".
+ * @throws Exception
+ */
+ public void testGenerateHostLimit() throws Exception{
+ ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+ list.add(createURLCrawlDatum("http://www.example.com/index1.html",
+ 1, 1));
+ list.add(createURLCrawlDatum("http://www.example.com/index2.html",
+ 1, 1));
+ list.add(createURLCrawlDatum("http://www.example.com/index3.html",
+ 1, 1));
+
+ createCrawlDB(list);
+
+ Configuration myConfiguration = new Configuration(conf);
+ myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 1);
+ Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+ Path fetchlistPath = new Path(new Path(generatedSegment,
+ CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+ ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+
+ // verify we got right amount of records
+ assertEquals(1, fetchList.size());
+
+
+ myConfiguration = new Configuration(conf);
+ myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 2);
+ generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+ fetchlistPath = new Path(new Path(generatedSegment,
+ CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+ fetchList = readContents(fetchlistPath);
+
+ // verify we got right amount of records
+ assertEquals(2, fetchList.size());
+
+ myConfiguration = new Configuration(conf);
+ myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 3);
+ generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+ fetchlistPath = new Path(new Path(generatedSegment,
+ CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+ fetchList = readContents(fetchlistPath);
+
+ // verify we got right amount of records
+ assertEquals(3, fetchList.size());
+ }
+
+ /**
+ * Test that generator obeys the property "generate.max.per.host.by.ip".
+ * @throws Exception
+ */
+ public void testGenerateHostIPLimit() throws Exception{
+ ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+ list.add(createURLCrawlDatum("http://www.example.com/index.html",
+ 1, 1));
+ list.add(createURLCrawlDatum("http://www.example.net/index.html",
+ 1, 1));
+ list.add(createURLCrawlDatum("http://www.example.org/index.html",
+ 1, 1));
+
+ createCrawlDB(list);
+
+ Configuration myConfiguration = new Configuration(conf);
+ myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 1);
+ myConfiguration.setBoolean(Generator.GENERATE_MAX_PER_HOST_BY_IP, true);
+
+ Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+ Path fetchlistPath = new Path(new Path(generatedSegment,
+ CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+ ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+
+ // verify we got right amount of records
+ assertEquals(1, fetchList.size());
+
+ myConfiguration = new Configuration(myConfiguration);
+ myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 2);
+ generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+ fetchlistPath = new Path(new Path(generatedSegment,
+ CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+ fetchList = readContents(fetchlistPath);
+
+ // verify we got right amount of records
+ assertEquals(2, fetchList.size());
+
+ myConfiguration = new Configuration(myConfiguration);
+ myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 3);
+ generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+ fetchlistPath = new Path(new Path(generatedSegment,
+ CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+ fetchList = readContents(fetchlistPath);
+
+ // verify we got right amount of records
+ assertEquals(3, fetchList.size());
+ }
+
+
+ /**
+ * Read contents of fetchlist.
+ * @param fetchlist path to Generated fetchlist
+ * @return Generated {@link URLCrawlDatum} objects
+ * @throws IOException
+ */
+ private ArrayList<URLCrawlDatum> readContents(Path fetchlist) throws IOException {
+ // verify results
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, fetchlist, conf);
+
+ ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();
+
+ READ: do {
+ Text key = new Text();
+ CrawlDatum value = new CrawlDatum();
+ if (!reader.next(key, value)) {
+ break READ;
+ }
+ l.add(new URLCrawlDatum(key, value));
+ } while (true);
+
+ reader.close();
+ return l;
+ }
+
+ /**
+ * Generate Fetchlist.
+ * @param numResults number of results to generate
+ * @param config Configuration to use
+ * @return path to generated segment
+ * @throws IOException
+ */
+ private Path generateFetchlist(int numResults, Configuration config) throws IOException {
+ // generate segment
+ Generator g = new Generator(config);
+ Path generatedSegment = g.generate(dbDir, segmentsDir, -1, numResults,
+ Long.MAX_VALUE);
+ return generatedSegment;
+ }
+
+ /**
+ * Creates CrawlDB.
+ *
+ * @param list database contents
+ * @throws IOException
+ * @throws Exception
+ */
+ private void createCrawlDB(ArrayList<URLCrawlDatum> list) throws IOException,
+ Exception {
+ dbDir = new Path(testdir, "crawldb");
+ segmentsDir = new Path(testdir, "segments");
+ fs.mkdirs(dbDir);
+ fs.mkdirs(segmentsDir);
+
+ // create crawldb
+ CrawlDBTestUtil.createCrawlDb(fs, dbDir, list);
+ }
+
+ /**
+ * Constructs new {@link URLCrawlDatum} from submitted parameters.
+ * @param url url to use
+ * @param fetchInterval {@link CrawlDatum#setFetchInterval(float)}
+ * @param score {@link CrawlDatum#setScore(float)}
+ * @return Constructed object
+ */
+ private URLCrawlDatum createURLCrawlDatum(final String url,
+ final float fetchInterval, final float score) {
+ return new CrawlDBTestUtil.URLCrawlDatum(new Text(url), new CrawlDatum(
+ CrawlDatum.STATUS_DB_UNFETCHED, fetchInterval, score));
}
}