You are viewing a plain text version of this content; the hyperlink to the canonical version was not preserved in this rendering.
Posted to commits@nutch.apache.org by ma...@apache.org on 2014/05/26 12:47:12 UTC
svn commit: r1597556 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml
src/java/org/apache/nutch/crawl/CrawlDb.java
Author: markus
Date: Mon May 26 10:47:11 2014
New Revision: 1597556
URL: http://svn.apache.org/r1597556
Log:
NUTCH-1786 CrawlDb should follow db.url.normalizers and db.url.filters
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1597556&r1=1597555&r2=1597556&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon May 26 10:47:11 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-1786 CrawlDb should follow db.url.normalizers and db.url.filters (Diaa via markus)
+
* NUTCH-1757 ParserChecker to take custom metadata as input (jnioche)
* NUTCH-1676 Add rudimentary SSL support to protocol-http (jnioche, markus)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1597556&r1=1597555&r2=1597556&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon May 26 10:47:11 2014
@@ -76,11 +76,11 @@
NOTE: You should also check other related properties:
- http.robots.agents
- http.agent.description
- http.agent.url
- http.agent.email
- http.agent.version
+ http.robots.agents
+ http.agent.description
+ http.agent.url
+ http.agent.email
+ http.agent.version
and set their values appropriately.
@@ -345,7 +345,6 @@
</property>
<!-- web db properties -->
-
<property>
<name>db.fetch.interval.default</name>
<value>2592000</value>
@@ -450,6 +449,18 @@
</property>
<property>
+ <name>db.url.normalizers</name>
+ <value>false</value>
+ <description>Normalize urls when updating crawldb</description>
+</property>
+
+<property>
+ <name>db.url.filters</name>
+ <value>false</value>
+ <description>Filter urls when updating crawldb</description>
+</property>
+
+<property>
<name>db.update.max.inlinks</name>
<value>10000</value>
<description>Maximum number of inlinks to take into account when updating
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=1597556&r1=1597555&r2=1597556&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Mon May 26 10:47:11 2014
@@ -114,7 +114,9 @@ public class CrawlDb extends Configured
long end = System.currentTimeMillis();
LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
-
+/*
+ * Configure a new CrawlDb in a temp folder at crawlDb/<rand>
+ */
public static JobConf createJob(Configuration config, Path crawlDb)
throws IOException {
Path newCrawlDb =
@@ -180,12 +182,11 @@ public class CrawlDb extends Configured
return -1;
}
- boolean normalize = false;
- boolean filter = false;
+ boolean normalize = getConf().getBoolean(CrawlDbFilter.URL_NORMALIZING, false);
+ boolean filter = getConf().getBoolean(CrawlDbFilter.URL_FILTERING, false);
+ boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
boolean force = false;
- boolean url404Purging = false;
final FileSystem fs = FileSystem.get(getConf());
- boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
HashSet<Path> dirs = new HashSet<Path>();
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-normalize")) {