Posted to commits@nutch.apache.org by si...@apache.org on 2006/11/15 20:42:22 UTC

svn commit: r475378 - in /lucene/nutch/trunk/src: java/org/apache/nutch/crawl/Generator.java test/org/apache/nutch/crawl/TestGenerator.java

Author: siren
Date: Wed Nov 15 11:42:22 2006
New Revision: 475378

URL: http://svn.apache.org/viewvc?view=rev&rev=475378
Log:
added more junit tests

Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
    lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=475378&r1=475377&r2=475378
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Wed Nov 15 11:42:22 2006
@@ -44,6 +44,10 @@
 /** Generates a subset of a crawl db to fetch. */
 public class Generator extends ToolBase {
 
+  public static final String GENERATE_MAX_PER_HOST_BY_IP = "generate.max.per.host.by.ip";
+  public static final String GENERATE_MAX_PER_HOST = "generate.max.per.host";
+  public static final String CRAWL_TOP_N = "crawl.topN";
+  public static final String CRAWL_GEN_CUR_TIME = "crawl.gen.curTime";
   public static final Log LOG = LogFactory.getLog(Generator.class);
   
   public static class SelectorEntry implements Writable {
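
For context, the constants added above name the configuration keys the generator reads; the same keys are exercised by the new tests below. A minimal, hypothetical caller-side sketch of setting them (the values here are illustrative only, not part of this commit):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.crawl.Generator;

    // Sketch only: configure per-host generation limits via the new keys.
    Configuration myConf = new Configuration();
    myConf.setInt(Generator.GENERATE_MAX_PER_HOST, 2);              // "generate.max.per.host"
    myConf.setBoolean(Generator.GENERATE_MAX_PER_HOST_BY_IP, true); // "generate.max.per.host.by.ip"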

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?view=diff&rev=475378&r1=475377&r2=475378
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Wed Nov 15 11:42:22 2006
@@ -31,12 +31,10 @@
 import junit.framework.TestCase;
 
 /**
- * Basic generator test:
- * 1. Insert entries in crawldb
- * 2. Generates entries to fetch
- * 3. Verifies that number of generated urls match
- * 4. Verifies that highest scoring urls are generated 
- 
+ * Basic generator test: 1. Inserts entries in crawldb 2. Generates entries to
+ * fetch 3. Verifies that the number of generated URLs matches 4. Verifies that
+ * the highest scoring URLs are generated
+ *
  * @author nutch-dev <nutch-dev at lucene.apache.org>
  *
  */
@@ -50,11 +48,11 @@
 
   FileSystem fs;
 
-  final static Path testdir=new Path("build/test/generator-test");
+  final static Path testdir = new Path("build/test/generator-test");
 
   protected void setUp() throws Exception {
     conf = CrawlDBTestUtil.createConfiguration();
-    fs=FileSystem.get(conf);
+    fs = FileSystem.get(conf);
     fs.delete(testdir);
   }
 
@@ -70,81 +68,243 @@
   }
 
   /**
-   * Test that generator generates fetchlish ordered by score (desc)
-   * 
+   * Test that the generator generates a fetchlist ordered by score (desc).
+   *
    * @throws Exception
    */
   public void testGenerateHighest() throws Exception {
 
-    int NUM_RESULTS=2;
- 
+    final int NUM_RESULTS = 2;
+
     ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
-    
-    for(int i=0;i<=100;i++){
-      list.add(new CrawlDBTestUtil.URLCrawlDatum(new Text("http://aaa/" + pad(i)),
-        new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 1, i)));
+
+    for (int i = 0; i <= 100; i++) {
+      list.add(createURLCrawlDatum("http://aaa/" + pad(i),
+          1, i));
     }
-    
-    dbDir = new Path(testdir, "crawldb");
-    segmentsDir = new Path(testdir, "segments");
-    fs.mkdirs(dbDir);
-    fs.mkdirs(segmentsDir);
-    
-    // create crawldb
-    CrawlDBTestUtil.createCrawlDb(fs, dbDir, list);
-    
-    // generate segment
-    Generator g=new Generator(conf);
-    Path generatedSegment=g.generate(dbDir, segmentsDir, -1, NUM_RESULTS, Long.MAX_VALUE);
-    
-    Path fetchlist=new Path(new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME), "part-00000");
-    
-    // verify results
-    SequenceFile.Reader reader=new SequenceFile.Reader(fs, fetchlist, conf);
-    
-    ArrayList<URLCrawlDatum> l=new ArrayList<URLCrawlDatum>();
-    
-    READ:
-      do {
-      Text key=new Text();
-      CrawlDatum value=new CrawlDatum();
-      if(!reader.next(key, value)) break READ;
-      l.add(new URLCrawlDatum(key, value));
-    } while(true);
 
-    reader.close();
+    createCrawlDB(list);
 
+    Path generatedSegment = generateFetchlist(NUM_RESULTS, conf);
+
+    Path fetchlist = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    ArrayList<URLCrawlDatum> l = readContents(fetchlist);
+
     // sort urls by score desc
     Collections.sort(l, new ScoreComparator());
 
-    //verify we got right amount of records
+    // verify we got right amount of records
     assertEquals(NUM_RESULTS, l.size());
 
-    //verify we have the highest scoring urls
+    // verify we have the highest scoring urls
     assertEquals("http://aaa/100", (l.get(0).url.toString()));
     assertEquals("http://aaa/099", (l.get(1).url.toString()));
   }
 
   private String pad(int i) {
-    String s=Integer.toString(i);
-    while(s.length()<3)
-      s="0" + s;
+    String s = Integer.toString(i);
+    while (s.length() < 3) {
+      s = "0" + s;
+    }
     return s;
   }
 
   /**
-   * Comparator that sorts by score desc
+   * Comparator that sorts by score desc.
    */
   public class ScoreComparator implements Comparator<URLCrawlDatum> {
 
     public int compare(URLCrawlDatum tuple1, URLCrawlDatum tuple2) {
-
-      if (tuple2.datum.getScore() - tuple1.datum.getScore() < 0)
+      if (tuple2.datum.getScore() - tuple1.datum.getScore() < 0) {
         return -1;
-      if (tuple2.datum.getScore() - tuple1.datum.getScore() > 0)
+      }
+      if (tuple2.datum.getScore() - tuple1.datum.getScore() > 0) {
         return 1;
-
+      }
       return 0;
     }
+  }
+
+  /**
+   * Test that the generator obeys the property "generate.max.per.host".
+   * @throws Exception
+   */
+  public void testGenerateHostLimit() throws Exception {
+    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+    list.add(createURLCrawlDatum("http://www.example.com/index1.html",
+        1, 1));
+    list.add(createURLCrawlDatum("http://www.example.com/index2.html",
+        1, 1));
+    list.add(createURLCrawlDatum("http://www.example.com/index3.html",
+        1, 1));
+
+    createCrawlDB(list);
+
+    Configuration myConfiguration = new Configuration(conf);
+    myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 1);
+    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+    Path fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    assertEquals(1, fetchList.size());
+
+
+    myConfiguration = new Configuration(conf);
+    myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 2);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+    fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    assertEquals(2, fetchList.size());
+
+    myConfiguration = new Configuration(conf);
+    myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 3);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+    fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    assertEquals(3, fetchList.size());
+  }
+
+  /**
+   * Test that the generator obeys "generate.max.per.host" when "generate.max.per.host.by.ip" is enabled.
+   * @throws Exception
+   */
+  public void testGenerateHostIPLimit() throws Exception {
+    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+    list.add(createURLCrawlDatum("http://www.example.com/index.html",
+        1, 1));
+    list.add(createURLCrawlDatum("http://www.example.net/index.html",
+        1, 1));
+    list.add(createURLCrawlDatum("http://www.example.org/index.html",
+        1, 1));
+
+    createCrawlDB(list);
+
+    Configuration myConfiguration = new Configuration(conf);
+    myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 1);
+    myConfiguration.setBoolean(Generator.GENERATE_MAX_PER_HOST_BY_IP, true);
+
+    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+    Path fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    assertEquals(1, fetchList.size());
+
+    myConfiguration = new Configuration(myConfiguration);
+    myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 2);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+    fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    assertEquals(2, fetchList.size());
+
+    myConfiguration = new Configuration(myConfiguration);
+    myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 3);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+    fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    assertEquals(3, fetchList.size());
+  }
+
+
+  /**
+   * Read contents of fetchlist.
+   * @param fetchlist path to generated fetchlist
+   * @return Generated {@link URLCrawlDatum} objects
+   * @throws IOException
+   */
+  private ArrayList<URLCrawlDatum> readContents(Path fetchlist) throws IOException {
+    // read the generated fetchlist
+    SequenceFile.Reader reader = new SequenceFile.Reader(fs, fetchlist, conf);
+
+    ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();
+
+    READ: do {
+      Text key = new Text();
+      CrawlDatum value = new CrawlDatum();
+      if (!reader.next(key, value)) {
+        break READ;
+      }
+      l.add(new URLCrawlDatum(key, value));
+    } while (true);
+
+    reader.close();
+    return l;
+  }
+
+  /**
+   * Generate Fetchlist.
+   * @param numResults number of results to generate
+   * @param config Configuration to use
+   * @return path to generated segment
+   * @throws IOException
+   */
+  private Path generateFetchlist(int numResults, Configuration config) throws IOException {
+    // generate segment
+    Generator g = new Generator(config);
+    Path generatedSegment = g.generate(dbDir, segmentsDir, -1, numResults,
+        Long.MAX_VALUE);
+    return generatedSegment;
+  }
+
+  /**
+   * Creates CrawlDB.
+   *
+   * @param list database contents
+   * @throws IOException
+   * @throws Exception
+   */
+  private void createCrawlDB(ArrayList<URLCrawlDatum> list)
+      throws Exception {
+    dbDir = new Path(testdir, "crawldb");
+    segmentsDir = new Path(testdir, "segments");
+    fs.mkdirs(dbDir);
+    fs.mkdirs(segmentsDir);
+
+    // create crawldb
+    CrawlDBTestUtil.createCrawlDb(fs, dbDir, list);
+  }
+
+  /**
+   * Constructs new {@link URLCrawlDatum} from submitted parameters.
+   * @param url url to use
+   * @param fetchInterval {@link CrawlDatum#setFetchInterval(float)}
+   * @param score {@link CrawlDatum#setScore(float)}
+   * @return Constructed object
+   */
+  private URLCrawlDatum createURLCrawlDatum(final String url,
+      final float fetchInterval, final float score) {
+    return new CrawlDBTestUtil.URLCrawlDatum(new Text(url), new CrawlDatum(
+        CrawlDatum.STATUS_DB_UNFETCHED, fetchInterval, score));
   }
 }
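
As a side note, the float-subtraction comparison in ScoreComparator above can be written more directly with Float.compare; a sketch that is equivalent for ordinary score values (not part of this commit):

    import java.util.Comparator;

    /** Sorts URLCrawlDatum tuples by score, descending. */
    public class ScoreComparator implements Comparator<URLCrawlDatum> {
      public int compare(URLCrawlDatum tuple1, URLCrawlDatum tuple2) {
        // Reversed argument order yields a descending sort by score.
        return Float.compare(tuple2.datum.getScore(), tuple1.datum.getScore());
      }
    }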