You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2007/01/11 14:25:44 UTC

svn commit: r495214 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Crawl.java src/java/org/apache/nutch/indexer/Indexer.java

Author: ab
Date: Thu Jan 11 05:25:43 2007
New Revision: 495214

URL: http://svn.apache.org/viewvc?view=rev&rev=495214
Log:
When indexing redirected pages, drop intermediate pages and only index the
final page.

Avoid NPEs in the Crawl tool when no URLs are generated or fetched.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=495214&r1=495213&r2=495214
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Jan 11 05:25:43 2007
@@ -123,6 +123,9 @@
 39. NUTCH-421 - Allow predeterminate running order of indexing filters
     (Alan Tanaman, siren)
 
+40. When indexing pages with redirection, drop all intermediate pages and
+    index only the final page. (ab)
+
 
 Release 0.8 - 2006-07-25
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?view=diff&rev=495214&r1=495213&r2=495214
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Thu Jan 11 05:25:43 2007
@@ -113,8 +113,8 @@
       
     // initialize crawlDb
     injector.inject(crawlDb, rootUrlDir);
-      
-    for (int i = 0; i < depth; i++) {             // generate new segment
+    int i;
+    for (i = 0; i < depth; i++) {             // generate new segment
       Path segment = generator.generate(crawlDb, segments, -1, topN, System
           .currentTimeMillis(), false, false);
       if (segment == null) {
@@ -127,14 +127,16 @@
       }
       crawlDbTool.update(crawlDb, new Path[]{segment}, true, true); // update crawldb
     }
-      
-    linkDbTool.invert(linkDb, segments, true, true, false); // invert links
-
-    // index, dedup & merge
-    indexer.index(indexes, crawlDb, linkDb, fs.listPaths(segments));
-    dedup.dedup(new Path[] { indexes });
-    merger.merge(fs.listPaths(indexes), index, tmpDir);
+    if (i > 0) {
+      linkDbTool.invert(linkDb, segments, true, true, false); // invert links
 
+      // index, dedup & merge
+      indexer.index(indexes, crawlDb, linkDb, fs.listPaths(segments));
+      dedup.dedup(new Path[] { indexes });
+      merger.merge(fs.listPaths(indexes), index, tmpDir);
+    } else {
+      LOG.warn("No URLs to fetch - check your seed list and URL filters.");
+    }
     if (LOG.isInfoEnabled()) { LOG.info("crawl finished: " + dir); }
   }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=495214&r1=495213&r2=495214
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Thu Jan 11 05:25:43 2007
@@ -182,6 +182,7 @@
     Inlinks inlinks = null;
     CrawlDatum dbDatum = null;
     CrawlDatum fetchDatum = null;
+    CrawlDatum redir = null;
     ParseData parseData = null;
     ParseText parseText = null;
     while (values.hasNext()) {
@@ -194,6 +195,9 @@
           dbDatum = datum;
         else if (CrawlDatum.hasFetchStatus(datum))
           fetchDatum = datum;
+        else if (CrawlDatum.STATUS_LINKED == datum.getStatus())
+          // redirected page
+          redir = datum;
         else
           throw new RuntimeException("Unexpected status: "+datum.getStatus());
       } else if (value instanceof ParseData) {
@@ -204,6 +208,11 @@
         LOG.warn("Unrecognized type: "+value.getClass());
       }
     }      
+    if (redir != null) {
+      // XXX page was redirected - what should we do?
+      // XXX discard it for now
+      return;
+    }
 
     if (fetchDatum == null || dbDatum == null
         || parseText == null || parseData == null) {