You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2011/09/09 13:13:55 UTC
svn commit: r1167096 - in /nutch/branches/branch-1.4: CHANGES.txt
conf/nutch-default.xml src/java/org/apache/nutch/crawl/CrawlDb.java
src/java/org/apache/nutch/crawl/CrawlDbFilter.java
Author: markus
Date: Fri Sep 9 11:13:54 2011
New Revision: 1167096
URL: http://svn.apache.org/viewvc?rev=1167096&view=rev
Log:
NUTCH-1101 Option to purge db_gone records from CrawlDB
Modified:
nutch/branches/branch-1.4/CHANGES.txt
nutch/branches/branch-1.4/conf/nutch-default.xml
nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDb.java
nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
Modified: nutch/branches/branch-1.4/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/CHANGES.txt?rev=1167096&r1=1167095&r2=1167096&view=diff
==============================================================================
--- nutch/branches/branch-1.4/CHANGES.txt (original)
+++ nutch/branches/branch-1.4/CHANGES.txt Fri Sep 9 11:13:54 2011
@@ -2,6 +2,8 @@ Nutch Change Log
Release 1.4 - Current development
+* NUTCH-1101 Option to purge db_gone records with updatedb (markus)
+
* NUTCH-1096 Empty (not null) ContentLength results in failure of fetch (Ferdy Galema via jnioche)
* NUTCH-1073 Rename parameters 'fetcher.threads.per.host.by.ip' and 'fetcher.threads.per.host' (jnioche)
Modified: nutch/branches/branch-1.4/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/nutch-default.xml?rev=1167096&r1=1167095&r2=1167096&view=diff
==============================================================================
--- nutch/branches/branch-1.4/conf/nutch-default.xml (original)
+++ nutch/branches/branch-1.4/conf/nutch-default.xml Fri Sep 9 11:13:54 2011
@@ -404,6 +404,14 @@
</property>
<property>
+ <name>db.update.purge.404</name>
+ <value>false</value>
+ <description>If true, updatedb will purge records with status DB_GONE
+ from the CrawlDB.
+ </description>
+</property>
+
+<property>
<name>db.update.max.inlinks</name>
<value>10000</value>
<description>Maximum number of inlinks to take into account when updating
Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=1167096&r1=1167095&r2=1167096&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDb.java Fri Sep 9 11:13:54 2011
@@ -46,6 +46,8 @@ public class CrawlDb extends Configured
public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
+ public static final String CRAWLDB_PURGE_404 = "db.update.purge.404";
+
public static final String CURRENT_NAME = "current";
public static final String LOCK_NAME = ".locked";
@@ -57,7 +59,7 @@ public class CrawlDb extends Configured
}
public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter) throws IOException {
- boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
+ boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
update(crawlDb, segments, normalize, filter, additionsAllowed, false);
}
@@ -67,6 +69,14 @@ public class CrawlDb extends Configured
LockUtil.createLockFile(fs, lock, force);
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
+
+ JobConf job = CrawlDb.createJob(getConf(), crawlDb);
+ job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
+ job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
+ job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
+
+ boolean url404Purging = job.getBoolean(CRAWLDB_PURGE_404, false);
+
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb update: starting at " + sdf.format(start));
LOG.info("CrawlDb update: db: " + crawlDb);
@@ -74,12 +84,9 @@ public class CrawlDb extends Configured
LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
LOG.info("CrawlDb update: URL normalizing: " + normalize);
LOG.info("CrawlDb update: URL filtering: " + filter);
+ LOG.info("CrawlDb update: 404 purging: " + url404Purging);
}
- JobConf job = CrawlDb.createJob(getConf(), crawlDb);
- job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
- job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
- job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
for (int i = 0; i < segments.length; i++) {
Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
@@ -166,11 +173,13 @@ public class CrawlDb extends Configured
System.err.println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)");
System.err.println("\t-filter\tuse URLFilters on urls in CrawlDb and segment");
System.err.println("\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");
+
return -1;
}
boolean normalize = false;
boolean filter = false;
boolean force = false;
+ boolean url404Purging = false;
final FileSystem fs = FileSystem.get(getConf());
boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
HashSet<Path> dirs = new HashSet<Path>();
Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDbFilter.java?rev=1167096&r1=1167095&r2=1167096&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDbFilter.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDbFilter.java Fri Sep 9 11:13:54 2011
@@ -46,6 +46,8 @@ public class CrawlDbFilter implements Ma
private boolean urlNormalizers;
+ private boolean url404Purging;
+
private URLFilters filters;
private URLNormalizers normalizers;
@@ -57,6 +59,8 @@ public class CrawlDbFilter implements Ma
public void configure(JobConf job) {
urlFiltering = job.getBoolean(URL_FILTERING, false);
urlNormalizers = job.getBoolean(URL_NORMALIZING, false);
+ url404Purging = job.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false);
+
if (urlFiltering) {
filters = new URLFilters(job);
}
@@ -75,6 +79,11 @@ public class CrawlDbFilter implements Ma
Reporter reporter) throws IOException {
String url = key.toString();
+
+ // https://issues.apache.org/jira/browse/NUTCH-1101 check status first, cheaper than normalizing or filtering
+ if (url404Purging && CrawlDatum.STATUS_DB_GONE == value.getStatus()) {
+ url = null;
+ }
if (urlNormalizers) {
try {
url = normalizers.normalize(url, scope); // normalize the url