You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2006/11/19 19:48:40 UTC

svn commit: r476879 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/test/ src/test/org/apache/nutch/crawl/ src/test/org/apache/nutch/fetcher/

Author: siren
Date: Sun Nov 19 10:48:39 2006
New Revision: 476879

URL: http://svn.apache.org/viewvc?view=rev&rev=476879
Log:
NUTCH-403 Make URL filtering optional in Generator

Added:
    lucene/nutch/trunk/src/test/filter-all.txt
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
    lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
    lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=476879&r1=476878&r2=476879
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sun Nov 19 10:48:39 2006
@@ -78,6 +78,8 @@
 
 25. NUTCH-404 - Fix LinkDB Usage - implementation mismatch (siren)
 
+26. NUTCH-403 - Make URL filtering optional in Generator (siren)
+
 Release 0.8 - 2006-07-25
 
  0. Totally new architecture, based on hadoop

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?view=diff&rev=476879&r1=476878&r2=476879
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Sun Nov 19 10:48:39 2006
@@ -115,9 +115,8 @@
     injector.inject(crawlDb, rootUrlDir);
       
     for (int i = 0; i < depth; i++) {             // generate new segment
-      Path segment =
-        generator.generate(crawlDb, segments, -1,
-                                     topN, System.currentTimeMillis());
+      Path segment = generator.generate(crawlDb, segments, -1, topN, System
+          .currentTimeMillis(), false);
       fetcher.fetch(segment, threads);  // fetch it
       if (!Fetcher.isParsing(job)) {
         parseSegment.parse(segment);    // parse it, if needed

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=476879&r1=476878&r2=476879
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Sun Nov 19 10:48:39 2006
@@ -44,6 +44,7 @@
 /** Generates a subset of a crawl db to fetch. */
 public class Generator extends ToolBase {
 
+  public static final String CRAWL_GENERATE_FILTER = "crawl.generate.filter";
   public static final String GENERATE_MAX_PER_HOST_BY_IP = "generate.max.per.host.by.ip";
   public static final String GENERATE_MAX_PER_HOST = "generate.max.per.host";
   public static final String CRAWL_TOP_N = "crawl.topN";
@@ -89,6 +90,7 @@
     private FloatWritable sortValue = new FloatWritable();
     private boolean byIP;
     private long dnsFailure = 0L;
+    private boolean filter;
 
     public void configure(JobConf job) {
       curTime = job.getLong(CRAWL_GEN_CUR_TIME, System.currentTimeMillis());
@@ -99,6 +101,7 @@
       normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
       scfilters = new ScoringFilters(job);
       hostPartitioner.configure(job);
+      filter = job.getBoolean(CRAWL_GENERATE_FILTER, true);
     }
 
     public void close() {}
@@ -108,13 +111,16 @@
                     OutputCollector output, Reporter reporter)
       throws IOException {
       Text url = (Text)key;
-      // don't generate URLs that don't pass URLFilters
-      try {
-        if (filters.filter(url.toString()) == null)
-          return;
-      } catch (URLFilterException e) {
-        if (LOG.isWarnEnabled()) {
-          LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")");
+      if (filter) {
+        // If filtering is on, don't generate URLs that don't pass URLFilters
+        try {
+          if (filters.filter(url.toString()) == null)
+            return;
+        } catch (URLFilterException e) {
+          if (LOG.isWarnEnabled()) {
+            LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage()
+                + ")");
+          }
         }
       }
       CrawlDatum crawlDatum = (CrawlDatum)value;
@@ -291,13 +297,13 @@
   /** Generate fetchlists in a segment. */
   public Path generate(Path dbDir, Path segments)
     throws IOException {
-    return generate(dbDir, segments,
-                    -1, Long.MAX_VALUE, System.currentTimeMillis());
+    return generate(dbDir, segments, -1, Long.MAX_VALUE, System
+        .currentTimeMillis(), true);
   }
 
   /** Generate fetchlists in a segment. */
   public Path generate(Path dbDir, Path segments,
-                       int numLists, long topN, long curTime)
+                       int numLists, long topN, long curTime, boolean filter)
     throws IOException {
 
     Path tempDir =
@@ -308,10 +314,12 @@
     Path segment = new Path(segments, generateSegmentName());
     Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
 
-    if (LOG.isInfoEnabled()) {
-      LOG.info("Generator: starting");
-      LOG.info("Generator: segment: " + segment);
-      LOG.info("Generator: Selecting best-scoring urls due for fetch.");
+    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
+    LOG.info("Generator: starting");
+    LOG.info("Generator: segment: " + segment);
+    LOG.info("Generator: filtering: " + filter);
+    if (topN != Long.MAX_VALUE) {
+      LOG.info("Generator: topN: " + topN);
     }
 
     // map to inverted subset due for fetch, sort by link count
@@ -326,8 +334,9 @@
       LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
       numLists = 1;
     }
-    job.setLong("crawl.gen.curTime", curTime);
-    job.setLong("crawl.topN", topN);
+    job.setLong(CRAWL_GEN_CUR_TIME, curTime);
+    job.setLong(CRAWL_TOP_N, topN);
+    job.setBoolean(CRAWL_GENERATE_FILTER, filter);
 
     job.setInputPath(new Path(dbDir, CrawlDatum.DB_DIR_NAME));
     job.setInputFormat(SequenceFileInputFormat.class);
@@ -393,7 +402,7 @@
   
   public int run(String[] args) throws Exception {
     if (args.length < 2) {
-      System.out.println("Usage: Generator <crawldb> <segments_dir> [-topN N] [-numFetchers numFetchers] [-adddays numDays]");
+      System.out.println("Usage: Generator <crawldb> <segments_dir> [-topN N] [-numFetchers numFetchers] [-adddays numDays] [-noFilter]");
       return -1;
     }
 
@@ -402,6 +411,7 @@
     long curTime = System.currentTimeMillis();
     long topN = Long.MAX_VALUE;
     int numFetchers = -1;
+    boolean filter = true;
 
     for (int i = 2; i < args.length; i++) {
       if ("-topN".equals(args[i])) {
@@ -413,14 +423,14 @@
       } else if ("-adddays".equals(args[i])) {
         long numDays = Integer.parseInt(args[i+1]);
         curTime += numDays * 1000L * 60 * 60 * 24;
+      } else if ("-noFilter".equals(args[i])) {
+        filter = false;
       }
+      
     }
 
-    if ((LOG.isInfoEnabled()) && (topN != Long.MAX_VALUE)) {
-      LOG.info("topN: " + topN);
-    }
     try {
-      generate(dbDir, segmentsDir, numFetchers, topN, curTime);
+      generate(dbDir, segmentsDir, numFetchers, topN, curTime, filter);
       return 0;
     } catch (Exception e) {
       LOG.fatal("Generator: " + StringUtils.stringifyException(e));

Added: lucene/nutch/trunk/src/test/filter-all.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/filter-all.txt?view=auto&rev=476879
==============================================================================
--- lucene/nutch/trunk/src/test/filter-all.txt (added)
+++ lucene/nutch/trunk/src/test/filter-all.txt Sun Nov 19 10:48:39 2006
@@ -0,0 +1,7 @@
+# Config file for urlfilter-suffix plugin
+# Filter away all urls
+
+# case-insensitive, disallow unknown suffixes
+-I
+
+# allow these

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?view=diff&rev=476879&r1=476878&r2=476879
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Sun Nov 19 10:48:39 2006
@@ -85,7 +85,7 @@
 
     createCrawlDB(list);
 
-    Path generatedSegment = generateFetchlist(NUM_RESULTS, conf);
+    Path generatedSegment = generateFetchlist(NUM_RESULTS, conf, false);
 
     Path fetchlist = new Path(new Path(generatedSegment,
         CrawlDatum.GENERATE_DIR_NAME), "part-00000");
@@ -145,7 +145,8 @@
 
     Configuration myConfiguration = new Configuration(conf);
     myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 1);
-    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
+        myConfiguration, false);
 
     Path fetchlistPath = new Path(new Path(generatedSegment,
         CrawlDatum.GENERATE_DIR_NAME), "part-00000");
@@ -155,10 +156,10 @@
     // verify we got right amount of records
     assertEquals(1, fetchList.size());
 
-    
     myConfiguration = new Configuration(conf);
     myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 2);
-    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+        false);
 
     fetchlistPath = new Path(new Path(generatedSegment,
         CrawlDatum.GENERATE_DIR_NAME), "part-00000");
@@ -170,7 +171,8 @@
 
     myConfiguration = new Configuration(conf);
     myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 3);
-    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+        false);
 
     fetchlistPath = new Path(new Path(generatedSegment,
         CrawlDatum.GENERATE_DIR_NAME), "part-00000");
@@ -180,7 +182,7 @@
     // verify we got right amount of records
     assertEquals(3, fetchList.size());
   }
-  
+
   /**
    * Test that generator obeys the property "generate.max.per.host" and
    * "generate.max.per.host.by.ip".
@@ -189,12 +191,9 @@
   public void testGenerateHostIPLimit() throws Exception{
     ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
 
-    list.add(createURLCrawlDatum("http://www.example.com/index.html",
-        1, 1));
-    list.add(createURLCrawlDatum("http://www.example.net/index.html",
-        1, 1));
-    list.add(createURLCrawlDatum("http://www.example.org/index.html",
-        1, 1));
+    list.add(createURLCrawlDatum("http://www.example.com/index.html", 1, 1));
+    list.add(createURLCrawlDatum("http://www.example.net/index.html", 1, 1));
+    list.add(createURLCrawlDatum("http://www.example.org/index.html", 1, 1));
 
     createCrawlDB(list);
 
@@ -202,7 +201,8 @@
     myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 1);
     myConfiguration.setBoolean(Generator.GENERATE_MAX_PER_HOST_BY_IP, true);
 
-    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
+        myConfiguration, false);
 
     Path fetchlistPath = new Path(new Path(generatedSegment,
         CrawlDatum.GENERATE_DIR_NAME), "part-00000");
@@ -214,7 +214,7 @@
 
     myConfiguration = new Configuration(myConfiguration);
     myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 2);
-    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
 
     fetchlistPath = new Path(new Path(generatedSegment,
         CrawlDatum.GENERATE_DIR_NAME), "part-00000");
@@ -226,7 +226,8 @@
 
     myConfiguration = new Configuration(myConfiguration);
     myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 3);
-    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+        false);
 
     fetchlistPath = new Path(new Path(generatedSegment,
         CrawlDatum.GENERATE_DIR_NAME), "part-00000");
@@ -237,6 +238,47 @@
     assertEquals(3, fetchList.size());
   }
 
+  /**
+   * Test generator obeys the filter setting.
+   * @throws Exception 
+   * @throws IOException 
+   */
+  public void testFilter() throws IOException, Exception{
+
+    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+    list.add(createURLCrawlDatum("http://www.example.com/index.html", 1, 1));
+    list.add(createURLCrawlDatum("http://www.example.net/index.html", 1, 1));
+    list.add(createURLCrawlDatum("http://www.example.org/index.html", 1, 1));
+
+    createCrawlDB(list);
+
+    Configuration myConfiguration = new Configuration(conf);
+    myConfiguration.set("urlfilter.suffix.file", "filter-all.txt");
+
+    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
+        myConfiguration, true);
+
+    Path fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+
+    // verify all got filtered out
+    assertEquals(0, fetchList.size());
+
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
+
+    fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    fetchList = readContents(fetchlistPath);
+
+    // verify nothing got filtered
+    assertEquals(list.size(), fetchList.size());
+
+  }
+
 
   /**
    * Read contents of fetchlist.
@@ -270,11 +312,12 @@
    * @return path to generated segment
    * @throws IOException
    */
-  private Path generateFetchlist(int numResults, Configuration config) throws IOException {
+  private Path generateFetchlist(int numResults, Configuration config,
+      boolean filter) throws IOException {
     // generate segment
     Generator g = new Generator(config);
     Path generatedSegment = g.generate(dbDir, segmentsDir, -1, numResults,
-        Long.MAX_VALUE);
+        Long.MAX_VALUE, filter);
     return generatedSegment;
   }
 

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java?view=diff&rev=476879&r1=476878&r2=476879
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Sun Nov 19 10:48:39 2006
@@ -87,7 +87,8 @@
 
     //generate
     Generator g=new Generator(conf);
-    Path generatedSegment=g.generate(crawldbPath, segmentsPath, 1, Long.MAX_VALUE, Long.MAX_VALUE);
+    Path generatedSegment = g.generate(crawldbPath, segmentsPath, 1,
+        Long.MAX_VALUE, Long.MAX_VALUE, false);
 
     long time=System.currentTimeMillis();
     //fetch