You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2007/01/11 14:25:44 UTC
svn commit: r495214 - in /lucene/nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/crawl/Crawl.java
src/java/org/apache/nutch/indexer/Indexer.java
Author: ab
Date: Thu Jan 11 05:25:43 2007
New Revision: 495214
URL: http://svn.apache.org/viewvc?view=rev&rev=495214
Log:
When indexing redirected pages, drop intermediate pages and only index the
final page.
Avoid NullPointerExceptions (NPEs) in the Crawl tool when no URLs are generated or fetched.
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=495214&r1=495213&r2=495214
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Jan 11 05:25:43 2007
@@ -123,6 +123,9 @@
39. NUTCH-421 - Allow predeterminate running order of indexing filters
(Alan Tanaman, siren)
+40. When indexing pages with redirection, drop all intermediate pages and
+ index only the final page. (ab)
+
Release 0.8 - 2006-07-25
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?view=diff&rev=495214&r1=495213&r2=495214
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Thu Jan 11 05:25:43 2007
@@ -113,8 +113,8 @@
// initialize crawlDb
injector.inject(crawlDb, rootUrlDir);
-
- for (int i = 0; i < depth; i++) { // generate new segment
+ int i;
+ for (i = 0; i < depth; i++) { // generate new segment
Path segment = generator.generate(crawlDb, segments, -1, topN, System
.currentTimeMillis(), false, false);
if (segment == null) {
@@ -127,14 +127,16 @@
}
crawlDbTool.update(crawlDb, new Path[]{segment}, true, true); // update crawldb
}
-
- linkDbTool.invert(linkDb, segments, true, true, false); // invert links
-
- // index, dedup & merge
- indexer.index(indexes, crawlDb, linkDb, fs.listPaths(segments));
- dedup.dedup(new Path[] { indexes });
- merger.merge(fs.listPaths(indexes), index, tmpDir);
+ if (i > 0) {
+ linkDbTool.invert(linkDb, segments, true, true, false); // invert links
+ // index, dedup & merge
+ indexer.index(indexes, crawlDb, linkDb, fs.listPaths(segments));
+ dedup.dedup(new Path[] { indexes });
+ merger.merge(fs.listPaths(indexes), index, tmpDir);
+ } else {
+ LOG.warn("No URLs to fetch - check your seed list and URL filters.");
+ }
if (LOG.isInfoEnabled()) { LOG.info("crawl finished: " + dir); }
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=495214&r1=495213&r2=495214
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Thu Jan 11 05:25:43 2007
@@ -182,6 +182,7 @@
Inlinks inlinks = null;
CrawlDatum dbDatum = null;
CrawlDatum fetchDatum = null;
+ CrawlDatum redir = null;
ParseData parseData = null;
ParseText parseText = null;
while (values.hasNext()) {
@@ -194,6 +195,9 @@
dbDatum = datum;
else if (CrawlDatum.hasFetchStatus(datum))
fetchDatum = datum;
+ else if (CrawlDatum.STATUS_LINKED == datum.getStatus())
+ // redirected page
+ redir = datum;
else
throw new RuntimeException("Unexpected status: "+datum.getStatus());
} else if (value instanceof ParseData) {
@@ -204,6 +208,11 @@
LOG.warn("Unrecognized type: "+value.getClass());
}
}
+ if (redir != null) {
+ // XXX page was redirected - what should we do?
+ // XXX discard it for now
+ return;
+ }
if (fetchDatum == null || dbDatum == null
|| parseText == null || parseData == null) {